//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "MCTargetDesc/X86ShuffleDecode.h"
#include "X86.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
    "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
    cl::desc(
        "Sets the preferable loop alignment for experiments (as log2 bytes) "
        "for innermost loops only. If specified, this option overrides "
        "alignment set by x86-experimental-pref-loop-alignment."),
    cl::Hidden);

static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);

static cl::opt<bool> ExperimentalUnorderedISEL(
    "x86-experimental-unordered-atomic-isel", cl::init(false),
    cl::desc("Use LoadSDNode and StoreSDNode instead of "
             "AtomicSDNode for unordered atomic loads and "
             "stores respectively."),
    cl::Hidden);

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}
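// Illustrative use: a lowering routine that hits an unsupported ABI case can
// report it and keep going, e.g.
//   errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
// and then substitute a reasonable value so compilation can continue.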

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  X86ScalarSSEf16 = Subtarget.hasFP16();
  MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
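  // addBypassSlowDiv(N, M) asks the slow-division bypass transform to guard
  // each N-bit divide with a runtime check and use the cheaper M-bit divide
  // whenever both operands happen to fit in M bits.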

  // Setup Windows compiler runtime calls.
  if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
      { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
      { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
      { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
      { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
    // MSVCRT doesn't have powi; fall back to pow
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
  }

  // If we don't have cmpxchg8b (meaning this is a 386/486), limit the atomic
  // size to 32 bits so that AtomicExpandPass will expand it and we don't need
  // cmpxchg8b.
  // FIXME: Should we be limiting the atomic size on other configs? Default is
  // 1024.
  if (!Subtarget.hasCmpxchg8b())
    setMaxAtomicSizeInBitsSupported(32);

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
    setCondCodeAction(ISD::SETOEQ, VT, Expand);
    setCondCodeAction(ISD::SETUNE, VT, Expand);
  }
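  // Expanding them lets the legalizer split each into two comparisons that it
  // then combines, roughly SETOEQ -> SETO && SETEQ and SETUNE -> SETUO || SETNE.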

  // Integer absolute.
  if (Subtarget.hasCMov()) {
    setOperationAction(ISD::ABS            , MVT::i16  , Custom);
    setOperationAction(ISD::ABS            , MVT::i32  , Custom);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::ABS          , MVT::i64  , Custom);
  }

  // Signed saturation subtraction.
  setOperationAction(ISD::SSUBSAT          , MVT::i8   , Custom);
  setOperationAction(ISD::SSUBSAT          , MVT::i16  , Custom);
  setOperationAction(ISD::SSUBSAT          , MVT::i32  , Custom);
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SSUBSAT        , MVT::i64  , Custom);

  // Funnel shifts.
  for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
    // For slow shld targets we only lower for code size.
    LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;

    setOperationAction(ShiftOp             , MVT::i8   , Custom);
    setOperationAction(ShiftOp             , MVT::i16  , Custom);
    setOperationAction(ShiftOp             , MVT::i32  , ShiftDoubleAction);
    if (Subtarget.is64Bit())
      setOperationAction(ShiftOp           , MVT::i64  , ShiftDoubleAction);
  }

  if (!Subtarget.useSoftFloat()) {
    // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
    // operation.
    setOperationAction(ISD::UINT_TO_FP,        MVT::i8, Promote);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
    setOperationAction(ISD::UINT_TO_FP,        MVT::i16, Promote);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP,        MVT::i32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP,        MVT::i64, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);

    // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
    // this operation.
    setOperationAction(ISD::SINT_TO_FP,        MVT::i8, Promote);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
    // SSE has no i16 to fp conversion, only i32. We promote in the handler
    // to allow f80 to use i16 and f64 to use i16 with sse1 only
    setOperationAction(ISD::SINT_TO_FP,        MVT::i16, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
    // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
    setOperationAction(ISD::SINT_TO_FP,        MVT::i32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
    // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::SINT_TO_FP,        MVT::i64, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);

    // Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
    // this operation.
    setOperationAction(ISD::FP_TO_SINT,        MVT::i8,  Promote);
    // FIXME: This doesn't generate invalid exception when it should. PR44019.
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8,  Promote);
    setOperationAction(ISD::FP_TO_SINT,        MVT::i16, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT,        MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
    // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT,        MVT::i64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);

    // Handle FP_TO_UINT by promoting the destination to a larger signed
    // conversion.
    setOperationAction(ISD::FP_TO_UINT,        MVT::i8,  Promote);
    // FIXME: This doesn't generate invalid exception when it should. PR44019.
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8,  Promote);
    setOperationAction(ISD::FP_TO_UINT,        MVT::i16, Promote);
    // FIXME: This doesn't generate invalid exception when it should. PR44019.
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_UINT,        MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT,        MVT::i64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);

    setOperationAction(ISD::LRINT,             MVT::f32, Custom);
    setOperationAction(ISD::LRINT,             MVT::f64, Custom);
    setOperationAction(ISD::LLRINT,            MVT::f32, Custom);
    setOperationAction(ISD::LLRINT,            MVT::f64, Custom);

    if (!Subtarget.is64Bit()) {
      setOperationAction(ISD::LRINT,  MVT::i64, Custom);
      setOperationAction(ISD::LLRINT, MVT::i64, Custom);
    }
  }

  if (Subtarget.hasSSE2()) {
    // Custom lowering for saturating float to int conversions.
    // We handle promotion to larger result types manually.
    for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
      setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
      setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
    }
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
      setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
    }
  }

  // Handle address space casts between mixed sized pointers.
  setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
  setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
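  // For example, "x / y" and "x % y" both map onto the same hardware DIV/IDIV,
  // which produces the quotient in one register and the remainder in another,
  // so CSE of the two-result node turns the pair into a single divide.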
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }

  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC,     VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);

  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
  setOperationAction(ISD::FREM             , MVT::f128 , Expand);

  if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
    setOperationAction(ISD::FLT_ROUNDS_    , MVT::i32  , Custom);
    setOperationAction(ISD::SET_ROUNDING   , MVT::Other, Custom);
  }

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);

  if (Subtarget.hasBMI()) {
    // Promote the i16 zero undef variant and force it on up to i32 when tzcnt
    // is enabled.
    setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }
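  // Without BMI there is no TZCNT, only BSF, and BSF leaves its destination
  // undefined for a zero input; plain CTTZ is therefore custom lowered to add
  // a zero check, while CTTZ_ZERO_UNDEF can map directly to BSF.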

  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
  } else {
    for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
      if (VT == MVT::i64 && !Subtarget.is64Bit())
        continue;
      setOperationAction(ISD::CTLZ           , VT, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
    }
  }
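  // Likewise, without LZCNT the lowering goes through BSR, which returns the
  // index of the highest set bit (so the result is XORed with width-1) and is
  // undefined for zero, hence the Custom handling here as well.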

  for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
                  ISD::STRICT_FP_TO_FP16}) {
    // Special handling for half-precision floating point conversions.
    // If we don't have F16C support, then lower half float conversions
    // into library calls.
    setOperationAction(
        Op, MVT::f32,
        (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
    // There's never any support for operations beyond MVT::f32.
    setOperationAction(Op, MVT::f64, Expand);
    setOperationAction(Op, MVT::f80, Expand);
    setOperationAction(Op, MVT::f128, Expand);
  }

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

  setOperationAction(ISD::PARITY, MVT::i8, Custom);
  setOperationAction(ISD::PARITY, MVT::i16, Custom);
  setOperationAction(ISD::PARITY, MVT::i32, Custom);
  if (Subtarget.is64Bit())
    setOperationAction(ISD::PARITY, MVT::i64, Custom);
  if (Subtarget.hasPOPCNT()) {
    setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
    // popcntw is longer to encode than popcntl and also has a false dependency
    // on the dest that popcntl hasn't had since Cannon Lake.
    setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
  } else {
    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
    else
      setOperationAction(ISD::CTPOP        , MVT::i64  , Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);

  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC,  VT, Custom);
  }

  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
  setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);

  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
  // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  // Darwin ABI issue.
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool    , VT, Custom);
    setOperationAction(ISD::JumpTable       , VT, Custom);
    setOperationAction(ISD::GlobalAddress   , VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol  , VT, Custom);
    setOperationAction(ISD::BlockAddress    , VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }

  if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (!Subtarget.is64Bit())
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  if (Subtarget.getTargetTriple().isPS4CPU())
    setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
  else
    setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    // Disable f32->f64 extload as we can only generate this in one instruction
    // under optsize. So it's easier to pattern match (fpext (load)) for that
    // case instead of needing to emit 2 instructions for extload in the
    // non-optsize case.
    setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // These might be better off as horizontal vector ops.
      setOperationAction(ISD::FADD, VT, Custom);
      setOperationAction(ISD::FSUB, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN   , VT, Expand);
      setOperationAction(ISD::FCOS   , VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

  } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
             (UseX87 || Is64Bit)) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    if (UseX87)
      addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    if (UseX87)
      setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    if (UseX87)
      setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    if (UseX87) {
      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF,     VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN   , VT, Expand);
      setOperationAction(ISD::FCOS   , VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }
  }

  // Expand FP32 immediates into loads from the stack, save special cases.
  if (isTypeLegal(MVT::f32)) {
    if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
      addLegalFPImmediate(APFloat(+0.0f)); // FLD0
      addLegalFPImmediate(APFloat(+1.0f)); // FLD1
      addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
      addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
    } else // SSE immediates.
      addLegalFPImmediate(APFloat(+0.0f)); // xorps
  }
  // Expand FP64 immediates into loads from the stack, save special cases.
  if (isTypeLegal(MVT::f64)) {
    if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
      addLegalFPImmediate(APFloat(+0.0)); // FLD0
      addLegalFPImmediate(APFloat(+1.0)); // FLD1
      addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
      addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    } else // SSE immediates.
      addLegalFPImmediate(APFloat(+0.0)); // xorpd
  }
  // Handle constrained floating-point operations of scalar.
  setOperationAction(ISD::STRICT_FADD,      MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FADD,      MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FSUB,      MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FSUB,      MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FMUL,      MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FMUL,      MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FDIV,      MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FDIV,      MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FSQRT,     MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FSQRT,     MVT::f64, Legal);

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // f80 always uses X87.
  if (UseX87) {
    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN   , MVT::f80, Expand);
    setOperationAction(ISD::FCOS   , MVT::f80, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
    setOperationAction(ISD::LROUND, MVT::f80, Expand);
    setOperationAction(ISD::LLROUND, MVT::f80, Expand);
    setOperationAction(ISD::LRINT, MVT::f80, Custom);
    setOperationAction(ISD::LLRINT, MVT::f80, Custom);

    // Handle constrained floating-point operations of scalar.
    setOperationAction(ISD::STRICT_FADD     , MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FSUB     , MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FMUL     , MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FDIV     , MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FSQRT    , MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
    // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
    // as Custom.
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
  }

  // f128 uses xmm registers, but most operations require libcalls.
  if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                   : &X86::VR128RegClass);

    addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps

    setOperationAction(ISD::FADD,        MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
    setOperationAction(ISD::FSUB,        MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
    setOperationAction(ISD::FDIV,        MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
    setOperationAction(ISD::FMUL,        MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
    setOperationAction(ISD::FMA,         MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FMA,  MVT::f128, LibCall);

    setOperationAction(ISD::FABS, MVT::f128, Custom);
    setOperationAction(ISD::FNEG, MVT::f128, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);

    setOperationAction(ISD::FSIN,         MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FSIN,  MVT::f128, LibCall);
    setOperationAction(ISD::FCOS,         MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FCOS,  MVT::f128, LibCall);
    setOperationAction(ISD::FSINCOS,      MVT::f128, LibCall);
    // No STRICT_FSINCOS
    setOperationAction(ISD::FSQRT,        MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);

    setOperationAction(ISD::FP_EXTEND,        MVT::f128, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
    // We need to custom handle any FP_ROUND with an f128 input, but
    // LegalizeDAG uses the result type to know when to run a custom handler.
    // So we have to list all legal floating point result types here.
    if (isTypeLegal(MVT::f32)) {
      setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
    }
    if (isTypeLegal(MVT::f64)) {
      setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
    }
    if (isTypeLegal(MVT::f80)) {
      setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
    }

    setOperationAction(ISD::SETCC, MVT::f128, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
    setTruncStoreAction(MVT::f128, MVT::f32, Expand);
    setTruncStoreAction(MVT::f128, MVT::f64, Expand);
    setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f128 , Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
                   MVT::v4f32, MVT::v8f32,  MVT::v16f32,
                   MVT::v2f64, MVT::v4f64,  MVT::v8f64 }) {
    setOperationAction(ISD::FSIN,      VT, Expand);
    setOperationAction(ISD::FSINCOS,   VT, Expand);
    setOperationAction(ISD::FCOS,      VT, Expand);
    setOperationAction(ISD::FREM,      VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW,      VT, Expand);
    setOperationAction(ISD::FLOG,      VT, Expand);
    setOperationAction(ISD::FLOG2,     VT, Expand);
    setOperationAction(ISD::FLOG10,    VT, Expand);
    setOperationAction(ISD::FEXP,      VT, Expand);
    setOperationAction(ISD::FEXP2,     VT, Expand);
  }

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
    setOperationAction(ISD::FMA,  VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN,          MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);

    setOperationAction(ISD::LOAD,               MVT::v2f32, Custom);
    setOperationAction(ISD::STORE,              MVT::v2f32, Custom);

    setOperationAction(ISD::STRICT_FADD,        MVT::v4f32, Legal);
    setOperationAction(ISD::STRICT_FSUB,        MVT::v4f32, Legal);
    setOperationAction(ISD::STRICT_FMUL,        MVT::v4f32, Legal);
    setOperationAction(ISD::STRICT_FDIV,        MVT::v4f32, Legal);
    setOperationAction(ISD::STRICT_FSQRT,       MVT::v4f32, Legal);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
                     MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
      setOperationAction(ISD::SDIV, VT, Custom);
      setOperationAction(ISD::SREM, VT, Custom);
      setOperationAction(ISD::UDIV, VT, Custom);
      setOperationAction(ISD::UREM, VT, Custom);
    }

    setOperationAction(ISD::MUL,                MVT::v2i8,  Custom);
    setOperationAction(ISD::MUL,                MVT::v4i8,  Custom);
    setOperationAction(ISD::MUL,                MVT::v8i8,  Custom);

    setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
    setOperationAction(ISD::MULHU,              MVT::v4i32, Custom);
    setOperationAction(ISD::MULHS,              MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);

    setOperationAction(ISD::SMULO,              MVT::v16i8, Custom);
    setOperationAction(ISD::UMULO,              MVT::v16i8, Custom);

    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
    }

    setOperationAction(ISD::UADDSAT,            MVT::v16i8, Legal);
    setOperationAction(ISD::SADDSAT,            MVT::v16i8, Legal);
    setOperationAction(ISD::USUBSAT,            MVT::v16i8, Legal);
    setOperationAction(ISD::SSUBSAT,            MVT::v16i8, Legal);
    setOperationAction(ISD::UADDSAT,            MVT::v8i16, Legal);
    setOperationAction(ISD::SADDSAT,            MVT::v8i16, Legal);
    setOperationAction(ISD::USUBSAT,            MVT::v8i16, Legal);
    setOperationAction(ISD::SSUBSAT,            MVT::v8i16, Legal);
    setOperationAction(ISD::USUBSAT,            MVT::v4i32, Custom);
    setOperationAction(ISD::USUBSAT,            MVT::v2i64, Custom);

    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC,              VT, Custom);
      setOperationAction(ISD::STRICT_FSETCC,      VT, Custom);
      setOperationAction(ISD::STRICT_FSETCCS,     VT, Custom);
      setOperationAction(ISD::CTPOP,              VT, Custom);
      setOperationAction(ISD::ABS,                VT, Custom);

      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
      // setcc all the way to isel and prefer SETGT in some isel patterns.
      setCondCodeAction(ISD::SETLT, VT, Custom);
      setCondCodeAction(ISD::SETLE, VT, Custom);
    }

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::VSELECT,            VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::VSELECT,            VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4i32, Custom);
    setOperationAction(ISD::SELECT,             MVT::v8i16, Custom);
    setOperationAction(ISD::SELECT,             MVT::v16i8, Custom);

    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Custom);
    setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
    setOperationAction(ISD::FP_TO_UINT,         MVT::v2i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v4i32, Legal);
    setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v2i32, Custom);

    // Custom legalize these to avoid over promotion or custom promotion.
    for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
      setOperationAction(ISD::FP_TO_SINT,        VT, Custom);
      setOperationAction(ISD::FP_TO_UINT,        VT, Custom);
      setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
      setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
    }

    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
    setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP,         MVT::v2i32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v4i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::SINT_TO_FP,         MVT::v2f32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v2f32, Custom);
    setOperationAction(ISD::UINT_TO_FP,         MVT::v2f32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
    setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v2f32, Custom);

    // We want to legalize this to an f64 load rather than an i64 load on
    // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
    // store.
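    // For example, a v2i32 load then becomes a single 8-byte MOVSD/MOVQ
    // directly into an XMM register rather than an integer load plus a
    // GPR-to-XMM transfer.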
1063     setOperationAction(ISD::LOAD,               MVT::v2i32, Custom);
1064     setOperationAction(ISD::LOAD,               MVT::v4i16, Custom);
1065     setOperationAction(ISD::LOAD,               MVT::v8i8,  Custom);
1066     setOperationAction(ISD::STORE,              MVT::v2i32, Custom);
1067     setOperationAction(ISD::STORE,              MVT::v4i16, Custom);
1068     setOperationAction(ISD::STORE,              MVT::v8i8,  Custom);
1069 
1070     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
1071     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
1072     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
1073     if (!Subtarget.hasAVX512())
1074       setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1075 
1076     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1077     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1078     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1079 
1080     setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1081 
1082     setOperationAction(ISD::TRUNCATE,    MVT::v2i8,  Custom);
1083     setOperationAction(ISD::TRUNCATE,    MVT::v2i16, Custom);
1084     setOperationAction(ISD::TRUNCATE,    MVT::v2i32, Custom);
1085     setOperationAction(ISD::TRUNCATE,    MVT::v4i8,  Custom);
1086     setOperationAction(ISD::TRUNCATE,    MVT::v4i16, Custom);
1087     setOperationAction(ISD::TRUNCATE,    MVT::v8i8,  Custom);
1088 
1089     // In the customized shift lowering, the legal v4i32/v2i64 cases
1090     // in AVX2 will be recognized.
1091     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1092       setOperationAction(ISD::SRL,              VT, Custom);
1093       setOperationAction(ISD::SHL,              VT, Custom);
1094       setOperationAction(ISD::SRA,              VT, Custom);
1095       if (VT == MVT::v2i64) continue;
1096       setOperationAction(ISD::ROTL,             VT, Custom);
1097       setOperationAction(ISD::ROTR,             VT, Custom);
1098       setOperationAction(ISD::FSHL,             VT, Custom);
1099       setOperationAction(ISD::FSHR,             VT, Custom);
1100     }
1101 
1102     setOperationAction(ISD::STRICT_FSQRT,       MVT::v2f64, Legal);
1103     setOperationAction(ISD::STRICT_FADD,        MVT::v2f64, Legal);
1104     setOperationAction(ISD::STRICT_FSUB,        MVT::v2f64, Legal);
1105     setOperationAction(ISD::STRICT_FMUL,        MVT::v2f64, Legal);
1106     setOperationAction(ISD::STRICT_FDIV,        MVT::v2f64, Legal);
1107   }
1108 
1109   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1110     setOperationAction(ISD::ABS,                MVT::v16i8, Legal);
1111     setOperationAction(ISD::ABS,                MVT::v8i16, Legal);
1112     setOperationAction(ISD::ABS,                MVT::v4i32, Legal);
1113     setOperationAction(ISD::BITREVERSE,         MVT::v16i8, Custom);
1114     setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
1115     setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
1116     setOperationAction(ISD::CTLZ,               MVT::v4i32, Custom);
1117     setOperationAction(ISD::CTLZ,               MVT::v2i64, Custom);
1118 
1119     // These might be better off as horizontal vector ops.
1120     setOperationAction(ISD::ADD,                MVT::i16, Custom);
1121     setOperationAction(ISD::ADD,                MVT::i32, Custom);
1122     setOperationAction(ISD::SUB,                MVT::i16, Custom);
1123     setOperationAction(ISD::SUB,                MVT::i32, Custom);
1124   }
1125 
1126   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1127     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1128       setOperationAction(ISD::FFLOOR,            RoundedTy,  Legal);
1129       setOperationAction(ISD::STRICT_FFLOOR,     RoundedTy,  Legal);
1130       setOperationAction(ISD::FCEIL,             RoundedTy,  Legal);
1131       setOperationAction(ISD::STRICT_FCEIL,      RoundedTy,  Legal);
1132       setOperationAction(ISD::FTRUNC,            RoundedTy,  Legal);
1133       setOperationAction(ISD::STRICT_FTRUNC,     RoundedTy,  Legal);
1134       setOperationAction(ISD::FRINT,             RoundedTy,  Legal);
1135       setOperationAction(ISD::STRICT_FRINT,      RoundedTy,  Legal);
1136       setOperationAction(ISD::FNEARBYINT,        RoundedTy,  Legal);
1137       setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy,  Legal);
1138       setOperationAction(ISD::FROUNDEVEN,        RoundedTy,  Legal);
1139       setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy,  Legal);
1140 
1141       setOperationAction(ISD::FROUND,            RoundedTy,  Custom);
1142     }
1143 
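    // SSE4.1 adds the packed min/max forms that SSE2 lacks: PMINSB/PMAXSB,
    // PMINUW/PMAXUW and the 32-bit PMIN[SU]D/PMAX[SU]D.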
1144     setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
1145     setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
1146     setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
1147     setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
1148     setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
1149     setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
1150     setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
1151     setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
1152 
1153     setOperationAction(ISD::UADDSAT,            MVT::v4i32, Custom);
1154     setOperationAction(ISD::SADDSAT,            MVT::v2i64, Custom);
1155     setOperationAction(ISD::SSUBSAT,            MVT::v2i64, Custom);
1156 
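    // SSE4.1's PMULLD makes v4i32 multiplies legal.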
1157     // FIXME: Do we need to handle scalar-to-vector here?
1158     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1159 
1160     // We directly match byte blends in the backend as they match the VSELECT
1161     // condition form.
1162     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1163 
1164     // SSE41 brings specific instructions for doing vector sign extend even in
1165     // cases where we don't have SRA.
1166     for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1167       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1168       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1169     }
1170 
1171     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1172     for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1173       setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8,  Legal);
1174       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8,  Legal);
1175       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8,  Legal);
1176       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1177       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1178       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1179     }
1180 
1181     if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
      // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we
      // can do the pre and post work in the vector domain.
1184       setOperationAction(ISD::UINT_TO_FP,        MVT::v4i64, Custom);
1185       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1186       // We need to mark SINT_TO_FP as Custom even though we want to expand it
1187       // so that DAG combine doesn't try to turn it into uint_to_fp.
1188       setOperationAction(ISD::SINT_TO_FP,        MVT::v4i64, Custom);
1189       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1190     }
1191   }
1192 
1193   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1194     setOperationAction(ISD::UADDSAT,            MVT::v2i64, Custom);
1195   }
1196 
1197   if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1198     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
1199                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1200       setOperationAction(ISD::ROTL, VT, Custom);
1201       setOperationAction(ISD::ROTR, VT, Custom);
1202     }
1203 
1204     // XOP can efficiently perform BITREVERSE with VPPERM.
1205     for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1206       setOperationAction(ISD::BITREVERSE, VT, Custom);
1207 
1208     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
1209                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1210       setOperationAction(ISD::BITREVERSE, VT, Custom);
1211   }
1212 
1213   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1214     bool HasInt256 = Subtarget.hasInt256();
1215 
1216     addRegisterClass(MVT::v32i8,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1217                                                      : &X86::VR256RegClass);
1218     addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1219                                                      : &X86::VR256RegClass);
1220     addRegisterClass(MVT::v8i32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1221                                                      : &X86::VR256RegClass);
1222     addRegisterClass(MVT::v8f32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1223                                                      : &X86::VR256RegClass);
1224     addRegisterClass(MVT::v4i64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1225                                                      : &X86::VR256RegClass);
1226     addRegisterClass(MVT::v4f64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1227                                                      : &X86::VR256RegClass);
1228 
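    // The 256-bit VROUNDPS/VROUNDPD instructions handle these rounding
    // operations; FROUND again needs custom lowering.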
1229     for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1230       setOperationAction(ISD::FFLOOR,            VT, Legal);
1231       setOperationAction(ISD::STRICT_FFLOOR,     VT, Legal);
1232       setOperationAction(ISD::FCEIL,             VT, Legal);
1233       setOperationAction(ISD::STRICT_FCEIL,      VT, Legal);
1234       setOperationAction(ISD::FTRUNC,            VT, Legal);
1235       setOperationAction(ISD::STRICT_FTRUNC,     VT, Legal);
1236       setOperationAction(ISD::FRINT,             VT, Legal);
1237       setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
1238       setOperationAction(ISD::FNEARBYINT,        VT, Legal);
1239       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1240       setOperationAction(ISD::FROUNDEVEN,        VT, Legal);
1241       setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1242 
1243       setOperationAction(ISD::FROUND,            VT, Custom);
1244 
1245       setOperationAction(ISD::FNEG,              VT, Custom);
1246       setOperationAction(ISD::FABS,              VT, Custom);
1247       setOperationAction(ISD::FCOPYSIGN,         VT, Custom);
1248     }
1249 
1250     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1251     // even though v8i16 is a legal type.
1252     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i16, MVT::v8i32);
1253     setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v8i16, MVT::v8i32);
1254     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1255     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1256     setOperationAction(ISD::FP_TO_SINT,                MVT::v8i32, Legal);
1257     setOperationAction(ISD::FP_TO_UINT,                MVT::v8i32, Custom);
1258     setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v8i32, Legal);
1259 
1260     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1261     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v8i32, Legal);
1262 
1263     setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v4f32, Legal);
1264     setOperationAction(ISD::STRICT_FADD,        MVT::v8f32, Legal);
1265     setOperationAction(ISD::STRICT_FADD,        MVT::v4f64, Legal);
1266     setOperationAction(ISD::STRICT_FSUB,        MVT::v8f32, Legal);
1267     setOperationAction(ISD::STRICT_FSUB,        MVT::v4f64, Legal);
1268     setOperationAction(ISD::STRICT_FMUL,        MVT::v8f32, Legal);
1269     setOperationAction(ISD::STRICT_FMUL,        MVT::v4f64, Legal);
1270     setOperationAction(ISD::STRICT_FDIV,        MVT::v8f32, Legal);
1271     setOperationAction(ISD::STRICT_FDIV,        MVT::v4f64, Legal);
1272     setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v4f64, Legal);
1273     setOperationAction(ISD::STRICT_FSQRT,       MVT::v8f32, Legal);
1274     setOperationAction(ISD::STRICT_FSQRT,       MVT::v4f64, Legal);
1275 
1276     if (!Subtarget.hasAVX512())
1277       setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1278 
    // In the custom shift lowering, the v8i32/v4i64 cases that are legal
    // with AVX2 will be recognized.
1281     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1282       setOperationAction(ISD::SRL, VT, Custom);
1283       setOperationAction(ISD::SHL, VT, Custom);
1284       setOperationAction(ISD::SRA, VT, Custom);
1285       if (VT == MVT::v4i64) continue;
1286       setOperationAction(ISD::ROTL, VT, Custom);
1287       setOperationAction(ISD::ROTR, VT, Custom);
1288     }
1289 
1290     setOperationAction(ISD::FSHL,       MVT::v32i8, Custom);
1291     setOperationAction(ISD::FSHR,       MVT::v32i8, Custom);
1292     setOperationAction(ISD::FSHL,       MVT::v8i32, Custom);
1293     setOperationAction(ISD::FSHR,       MVT::v8i32, Custom);
1294 
1295     // These types need custom splitting if their input is a 128-bit vector.
1296     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i64,  Custom);
1297     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i32, Custom);
1298     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i64,  Custom);
1299     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i32, Custom);
1300 
1301     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1302     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1303     setOperationAction(ISD::SELECT,            MVT::v8i32, Custom);
1304     setOperationAction(ISD::SELECT,            MVT::v16i16, Custom);
1305     setOperationAction(ISD::SELECT,            MVT::v32i8, Custom);
1306     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1307 
1308     for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1309       setOperationAction(ISD::SIGN_EXTEND,     VT, Custom);
1310       setOperationAction(ISD::ZERO_EXTEND,     VT, Custom);
1311       setOperationAction(ISD::ANY_EXTEND,      VT, Custom);
1312     }
1313 
1314     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1315     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1316     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1317     setOperationAction(ISD::BITREVERSE,        MVT::v32i8, Custom);
1318 
1319     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1320       setOperationAction(ISD::SETCC,           VT, Custom);
1321       setOperationAction(ISD::STRICT_FSETCC,   VT, Custom);
1322       setOperationAction(ISD::STRICT_FSETCCS,  VT, Custom);
1323       setOperationAction(ISD::CTPOP,           VT, Custom);
1324       setOperationAction(ISD::CTLZ,            VT, Custom);
1325 
      // These condition codes aren't legal in SSE/AVX; under AVX512 we keep
      // the setcc all the way to isel and prefer SETGT in some isel patterns.
1328       setCondCodeAction(ISD::SETLT, VT, Custom);
1329       setCondCodeAction(ISD::SETLE, VT, Custom);
1330     }
1331 
1332     if (Subtarget.hasAnyFMA()) {
1333       for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1334                        MVT::v2f64, MVT::v4f64 }) {
1335         setOperationAction(ISD::FMA, VT, Legal);
1336         setOperationAction(ISD::STRICT_FMA, VT, Legal);
1337       }
1338     }
1339 
1340     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1341       setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1342       setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1343     }
1344 
1345     setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
1346     setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
1347     setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
1348     setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
1349 
1350     setOperationAction(ISD::MULHU,     MVT::v8i32,  Custom);
1351     setOperationAction(ISD::MULHS,     MVT::v8i32,  Custom);
1352     setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
1353     setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
1354     setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
1355     setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
1356 
1357     setOperationAction(ISD::SMULO,     MVT::v32i8, Custom);
1358     setOperationAction(ISD::UMULO,     MVT::v32i8, Custom);
1359 
1360     setOperationAction(ISD::ABS,       MVT::v4i64,  Custom);
1361     setOperationAction(ISD::SMAX,      MVT::v4i64,  Custom);
1362     setOperationAction(ISD::UMAX,      MVT::v4i64,  Custom);
1363     setOperationAction(ISD::SMIN,      MVT::v4i64,  Custom);
1364     setOperationAction(ISD::UMIN,      MVT::v4i64,  Custom);
1365 
1366     setOperationAction(ISD::UADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1367     setOperationAction(ISD::SADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1368     setOperationAction(ISD::USUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1369     setOperationAction(ISD::SSUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1370     setOperationAction(ISD::UADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1371     setOperationAction(ISD::SADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1372     setOperationAction(ISD::USUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1373     setOperationAction(ISD::SSUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1374     setOperationAction(ISD::UADDSAT,   MVT::v8i32, Custom);
1375     setOperationAction(ISD::USUBSAT,   MVT::v8i32, Custom);
1376     setOperationAction(ISD::UADDSAT,   MVT::v4i64, Custom);
1377     setOperationAction(ISD::USUBSAT,   MVT::v4i64, Custom);
1378 
1379     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1380       setOperationAction(ISD::ABS,  VT, HasInt256 ? Legal : Custom);
1381       setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1382       setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1383       setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1384       setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1385     }
1386 
1387     for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1388       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1389       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1390     }
1391 
1392     if (HasInt256) {
      // The custom lowering for UINT_TO_FP for v8i32 becomes worthwhile once
      // we have a 256-bit-wide blend with immediate.
1395       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1396       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1397 
1398       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1399       for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1400         setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1401         setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i8,  Legal);
1402         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i8,  Legal);
1403         setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i16, Legal);
1404         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i16, Legal);
1405         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i32, Legal);
1406       }
1407     }
1408 
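    // AVX provides masked vector loads and stores (VMASKMOVPS/VMASKMOVPD);
    // AVX2 adds the integer VPMASKMOVD/VPMASKMOVQ forms.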
1409     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1410                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1411       setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
1412       setOperationAction(ISD::MSTORE, VT, Legal);
1413     }
1414 
1415     // Extract subvector is special because the value type
1416     // (result) is 128-bit but the source is 256-bit wide.
1417     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1418                      MVT::v4f32, MVT::v2f64 }) {
1419       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1420     }
1421 
1422     // Custom lower several nodes for 256-bit types.
1423     for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1424                     MVT::v8f32, MVT::v4f64 }) {
1425       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1426       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1427       setOperationAction(ISD::VSELECT,            VT, Custom);
1428       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1429       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1430       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1431       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
1432       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1433       setOperationAction(ISD::STORE,              VT, Custom);
1434     }
1435 
1436     if (HasInt256) {
1437       setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1438 
      // Custom legalize 2x32 gathers to get a little better code.
1440       setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1441       setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1442 
1443       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1444                        MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1445         setOperationAction(ISD::MGATHER,  VT, Custom);
1446     }
1447   }
1448 
1449   // This block controls legalization of the mask vector sizes that are
1450   // available with AVX512. 512-bit vectors are in a separate block controlled
1451   // by useAVX512Regs.
1452   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1453     addRegisterClass(MVT::v1i1,   &X86::VK1RegClass);
1454     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1455     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1456     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1457     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1458 
1459     setOperationAction(ISD::SELECT,             MVT::v1i1, Custom);
1460     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1461     setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i1, Custom);
1462 
1463     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i1,  MVT::v8i32);
1464     setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v8i1,  MVT::v8i32);
1465     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v4i1,  MVT::v4i32);
1466     setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v4i1,  MVT::v4i32);
1467     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1,  MVT::v8i32);
1468     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1,  MVT::v8i32);
1469     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1,  MVT::v4i32);
1470     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1,  MVT::v4i32);
1471     setOperationAction(ISD::FP_TO_SINT,                MVT::v2i1,  Custom);
1472     setOperationAction(ISD::FP_TO_UINT,                MVT::v2i1,  Custom);
1473     setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v2i1,  Custom);
1474     setOperationAction(ISD::STRICT_FP_TO_UINT,         MVT::v2i1,  Custom);
1475 
    // There is no byte-sized k-register load or store without AVX512DQ.
1477     if (!Subtarget.hasDQI()) {
1478       setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1479       setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1480       setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1481       setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1482 
1483       setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1484       setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1485       setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1486       setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1487     }
1488 
1489     // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1490     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1491       setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1492       setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1493       setOperationAction(ISD::ANY_EXTEND,  VT, Custom);
1494     }
1495 
1496     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1497       setOperationAction(ISD::VSELECT,          VT, Expand);
1498 
1499     for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1500       setOperationAction(ISD::SETCC,            VT, Custom);
1501       setOperationAction(ISD::STRICT_FSETCC,    VT, Custom);
1502       setOperationAction(ISD::STRICT_FSETCCS,   VT, Custom);
1503       setOperationAction(ISD::SELECT,           VT, Custom);
1504       setOperationAction(ISD::TRUNCATE,         VT, Custom);
1505 
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1512     }
1513 
1514     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1515       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1516   }
1517 
  // This block controls legalization for 512-bit operations with 32/64-bit
  // elements. These can be disabled based on the prefer-vector-width and
  // required-vector-width function attributes.
1521   if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1522     bool HasBWI = Subtarget.hasBWI();
1523 
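    // All 512-bit vector types use the ZMM (VR512) register class; BWI only
    // changes which operations on v32i16/v64i8 are legal, not the registers.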
1524     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1525     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1526     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1527     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1528     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1529     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1530 
1531     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1532       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
1533       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1534       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
1535       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
1536       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
1537       if (HasBWI)
1538         setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1539     }
1540 
1541     for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1542       setOperationAction(ISD::FNEG,  VT, Custom);
1543       setOperationAction(ISD::FABS,  VT, Custom);
1544       setOperationAction(ISD::FMA,   VT, Legal);
1545       setOperationAction(ISD::STRICT_FMA, VT, Legal);
1546       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1547     }
1548 
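    // As in the 256-bit case above, narrow fp_to_int results are promoted so
    // the conversion itself happens on v16i32.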
1549     for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
      setOperationPromotedToType(ISD::FP_TO_SINT,        VT, MVT::v16i32);
      setOperationPromotedToType(ISD::FP_TO_UINT,        VT, MVT::v16i32);
      setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
      setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1554     }
1555     setOperationAction(ISD::FP_TO_SINT,        MVT::v16i32, Legal);
1556     setOperationAction(ISD::FP_TO_UINT,        MVT::v16i32, Legal);
1557     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
1558     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
1559     setOperationAction(ISD::SINT_TO_FP,        MVT::v16i32, Legal);
1560     setOperationAction(ISD::UINT_TO_FP,        MVT::v16i32, Legal);
1561     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
1562     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
1563 
1564     setOperationAction(ISD::STRICT_FADD,      MVT::v16f32, Legal);
1565     setOperationAction(ISD::STRICT_FADD,      MVT::v8f64,  Legal);
1566     setOperationAction(ISD::STRICT_FSUB,      MVT::v16f32, Legal);
1567     setOperationAction(ISD::STRICT_FSUB,      MVT::v8f64,  Legal);
1568     setOperationAction(ISD::STRICT_FMUL,      MVT::v16f32, Legal);
1569     setOperationAction(ISD::STRICT_FMUL,      MVT::v8f64,  Legal);
1570     setOperationAction(ISD::STRICT_FDIV,      MVT::v16f32, Legal);
1571     setOperationAction(ISD::STRICT_FDIV,      MVT::v8f64,  Legal);
1572     setOperationAction(ISD::STRICT_FSQRT,     MVT::v16f32, Legal);
1573     setOperationAction(ISD::STRICT_FSQRT,     MVT::v8f64,  Legal);
1574     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64,  Legal);
1575     setOperationAction(ISD::STRICT_FP_ROUND,  MVT::v8f32,  Legal);
1576 
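    // AVX512F provides truncating stores from 512-bit integer vectors
    // (VPMOVQB/VPMOVQW/VPMOVQD and VPMOVDB/VPMOVDW); VPMOVWB requires BWI.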
1577     setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
1578     setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
1579     setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
1580     setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
1581     setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
1582     if (HasBWI)
1583       setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
1584 
1585     // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1586     // to 512-bit rather than use the AVX2 instructions so that we can use
1587     // k-masks.
1588     if (!Subtarget.hasVLX()) {
1589       for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1590            MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1591         setOperationAction(ISD::MLOAD,  VT, Custom);
1592         setOperationAction(ISD::MSTORE, VT, Custom);
1593       }
1594     }
1595 
1596     setOperationAction(ISD::TRUNCATE,    MVT::v8i32,  Legal);
1597     setOperationAction(ISD::TRUNCATE,    MVT::v16i16, Legal);
1598     setOperationAction(ISD::TRUNCATE,    MVT::v32i8,  HasBWI ? Legal : Custom);
1599     setOperationAction(ISD::TRUNCATE,    MVT::v16i64, Custom);
1600     setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1601     setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1602     setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
1603     setOperationAction(ISD::ANY_EXTEND,  MVT::v32i16, Custom);
1604     setOperationAction(ISD::ANY_EXTEND,  MVT::v16i32, Custom);
1605     setOperationAction(ISD::ANY_EXTEND,  MVT::v8i64,  Custom);
1606     setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1607     setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1608     setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
1609 
1610     if (HasBWI) {
1611       // Extends from v64i1 masks to 512-bit vectors.
1612       setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
1613       setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
1614       setOperationAction(ISD::ANY_EXTEND,         MVT::v64i8, Custom);
1615     }
1616 
1617     for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1618       setOperationAction(ISD::FFLOOR,            VT, Legal);
1619       setOperationAction(ISD::STRICT_FFLOOR,     VT, Legal);
1620       setOperationAction(ISD::FCEIL,             VT, Legal);
1621       setOperationAction(ISD::STRICT_FCEIL,      VT, Legal);
1622       setOperationAction(ISD::FTRUNC,            VT, Legal);
1623       setOperationAction(ISD::STRICT_FTRUNC,     VT, Legal);
1624       setOperationAction(ISD::FRINT,             VT, Legal);
1625       setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
1626       setOperationAction(ISD::FNEARBYINT,        VT, Legal);
1627       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1628       setOperationAction(ISD::FROUNDEVEN,        VT, Legal);
1629       setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1630 
1631       setOperationAction(ISD::FROUND,            VT, Custom);
1632     }
1633 
1634     for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1635       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1636       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1637     }
1638 
1639     setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1640     setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1641     setOperationAction(ISD::ADD, MVT::v64i8,  HasBWI ? Legal : Custom);
1642     setOperationAction(ISD::SUB, MVT::v64i8,  HasBWI ? Legal : Custom);
1643 
1644     setOperationAction(ISD::MUL, MVT::v8i64,  Custom);
1645     setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1646     setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1647     setOperationAction(ISD::MUL, MVT::v64i8,  Custom);
1648 
1649     setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1650     setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1651     setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1652     setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1653     setOperationAction(ISD::MULHS, MVT::v64i8,  Custom);
1654     setOperationAction(ISD::MULHU, MVT::v64i8,  Custom);
1655 
1656     setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1657     setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1658 
1659     setOperationAction(ISD::BITREVERSE, MVT::v64i8,  Custom);
1660 
1661     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1662       setOperationAction(ISD::SRL,              VT, Custom);
1663       setOperationAction(ISD::SHL,              VT, Custom);
1664       setOperationAction(ISD::SRA,              VT, Custom);
1665       setOperationAction(ISD::ROTL,             VT, Custom);
1666       setOperationAction(ISD::ROTR,             VT, Custom);
1667       setOperationAction(ISD::SETCC,            VT, Custom);
1668 
      // These condition codes aren't legal in SSE/AVX; under AVX512 we keep
      // the setcc all the way to isel and prefer SETGT in some isel patterns.
1671       setCondCodeAction(ISD::SETLT, VT, Custom);
1672       setCondCodeAction(ISD::SETLE, VT, Custom);
1673     }
1674     for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1675       setOperationAction(ISD::SMAX,             VT, Legal);
1676       setOperationAction(ISD::UMAX,             VT, Legal);
1677       setOperationAction(ISD::SMIN,             VT, Legal);
1678       setOperationAction(ISD::UMIN,             VT, Legal);
1679       setOperationAction(ISD::ABS,              VT, Legal);
1680       setOperationAction(ISD::CTPOP,            VT, Custom);
1681       setOperationAction(ISD::STRICT_FSETCC,    VT, Custom);
1682       setOperationAction(ISD::STRICT_FSETCCS,   VT, Custom);
1683     }
1684 
1685     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1686       setOperationAction(ISD::ABS,     VT, HasBWI ? Legal : Custom);
1687       setOperationAction(ISD::CTPOP,   VT, Subtarget.hasBITALG() ? Legal : Custom);
1688       setOperationAction(ISD::CTLZ,    VT, Custom);
1689       setOperationAction(ISD::SMAX,    VT, HasBWI ? Legal : Custom);
1690       setOperationAction(ISD::UMAX,    VT, HasBWI ? Legal : Custom);
1691       setOperationAction(ISD::SMIN,    VT, HasBWI ? Legal : Custom);
1692       setOperationAction(ISD::UMIN,    VT, HasBWI ? Legal : Custom);
1693       setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1694       setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1695       setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1696       setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1697     }
1698 
1699     setOperationAction(ISD::FSHL,       MVT::v64i8, Custom);
1700     setOperationAction(ISD::FSHR,       MVT::v64i8, Custom);
1701     setOperationAction(ISD::FSHL,      MVT::v16i32, Custom);
1702     setOperationAction(ISD::FSHR,      MVT::v16i32, Custom);
1703 
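    // AVX512DQ adds direct conversions between 64-bit integers and FP
    // (VCVTQQ2PD, VCVTTPD2QQ and friends) as well as VPMULLQ.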
1704     if (Subtarget.hasDQI()) {
1705       setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1706       setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1707       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
1708       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
1709       setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1710       setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1711       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
1712       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
1713 
1714       setOperationAction(ISD::MUL,        MVT::v8i64, Legal);
1715     }
1716 
1717     if (Subtarget.hasCDI()) {
      // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit
      // versions.
1719       for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1720         setOperationAction(ISD::CTLZ,            VT, Legal);
1721       }
1722     } // Subtarget.hasCDI()
1723 
1724     if (Subtarget.hasVPOPCNTDQ()) {
1725       for (auto VT : { MVT::v16i32, MVT::v8i64 })
1726         setOperationAction(ISD::CTPOP, VT, Legal);
1727     }
1728 
1729     // Extract subvector is special because the value type
1730     // (result) is 256-bit but the source is 512-bit wide.
1731     // 128-bit was made Legal under AVX1.
1732     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1733                      MVT::v8f32, MVT::v4f64 })
1734       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1735 
1736     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1737                      MVT::v16f32, MVT::v8f64 }) {
1738       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1739       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
1740       setOperationAction(ISD::SELECT,             VT, Custom);
1741       setOperationAction(ISD::VSELECT,            VT, Custom);
1742       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1743       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1744       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1745       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1746       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1747     }
1748 
1749     for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1750       setOperationAction(ISD::MLOAD,               VT, Legal);
1751       setOperationAction(ISD::MSTORE,              VT, Legal);
1752       setOperationAction(ISD::MGATHER,             VT, Custom);
1753       setOperationAction(ISD::MSCATTER,            VT, Custom);
1754     }
1755     if (HasBWI) {
1756       for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1757         setOperationAction(ISD::MLOAD,        VT, Legal);
1758         setOperationAction(ISD::MSTORE,       VT, Legal);
1759       }
1760     } else {
1761       setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1762       setOperationAction(ISD::STORE, MVT::v64i8,  Custom);
1763     }
1764 
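    // AVX512VBMI2 provides the VPSHLD/VPSHRD(V) concat-shift instructions
    // for funnel shifts.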
1765     if (Subtarget.hasVBMI2()) {
1766       for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1767                        MVT::v16i16, MVT::v8i32, MVT::v4i64,
1768                        MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1769         setOperationAction(ISD::FSHL, VT, Custom);
1770         setOperationAction(ISD::FSHR, VT, Custom);
1771       }
1772 
1773       setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1774       setOperationAction(ISD::ROTR, MVT::v8i16,  Custom);
1775       setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1776       setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1777     }
1778   }// useAVX512Regs
1779 
1780   // This block controls legalization for operations that don't have
1781   // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1782   // narrower widths.
1783   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1784     // These operations are handled on non-VLX by artificially widening in
1785     // isel patterns.
1786 
1787     setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
1788                        Subtarget.hasVLX() ? Legal : Custom);
1789     setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
1790                        Subtarget.hasVLX() ? Legal : Custom);
1791     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
1792                        Subtarget.hasVLX() ? Legal : Custom);
1793     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
1794                        Subtarget.hasVLX() ? Legal : Custom);
1795     setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v2i32, Custom);
1796     setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
1797                        Subtarget.hasVLX() ? Legal : Custom);
1798     setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
1799                        Subtarget.hasVLX() ? Legal : Custom);
1800     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
1801                        Subtarget.hasVLX() ? Legal : Custom);
1802     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
1803                        Subtarget.hasVLX() ? Legal : Custom);
1804 
1805     if (Subtarget.hasDQI()) {
1806       // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1807       // v2f32 UINT_TO_FP is already custom under SSE2.
1808       assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1809              isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1810              "Unexpected operation action!");
1811       // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1812       setOperationAction(ISD::FP_TO_SINT,        MVT::v2f32, Custom);
1813       setOperationAction(ISD::FP_TO_UINT,        MVT::v2f32, Custom);
1814       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1815       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1816     }
1817 
1818     for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1819       setOperationAction(ISD::SMAX, VT, Legal);
1820       setOperationAction(ISD::UMAX, VT, Legal);
1821       setOperationAction(ISD::SMIN, VT, Legal);
1822       setOperationAction(ISD::UMIN, VT, Legal);
1823       setOperationAction(ISD::ABS,  VT, Legal);
1824     }
1825 
1826     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1827       setOperationAction(ISD::ROTL,     VT, Custom);
1828       setOperationAction(ISD::ROTR,     VT, Custom);
1829     }
1830 
    // Custom legalize 2x32 scatters to get a little better code.
1832     setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1833     setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1834 
1835     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1836                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1837       setOperationAction(ISD::MSCATTER, VT, Custom);
1838 
1839     if (Subtarget.hasDQI()) {
1840       for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1841         setOperationAction(ISD::SINT_TO_FP, VT,
1842                            Subtarget.hasVLX() ? Legal : Custom);
1843         setOperationAction(ISD::UINT_TO_FP, VT,
1844                            Subtarget.hasVLX() ? Legal : Custom);
1845         setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
1846                            Subtarget.hasVLX() ? Legal : Custom);
1847         setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
1848                            Subtarget.hasVLX() ? Legal : Custom);
1849         setOperationAction(ISD::FP_TO_SINT, VT,
1850                            Subtarget.hasVLX() ? Legal : Custom);
1851         setOperationAction(ISD::FP_TO_UINT, VT,
1852                            Subtarget.hasVLX() ? Legal : Custom);
1853         setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
1854                            Subtarget.hasVLX() ? Legal : Custom);
1855         setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
1856                            Subtarget.hasVLX() ? Legal : Custom);
1857         setOperationAction(ISD::MUL,               VT, Legal);
1858       }
1859     }
1860 
1861     if (Subtarget.hasCDI()) {
1862       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1863         setOperationAction(ISD::CTLZ,            VT, Legal);
1864       }
1865     } // Subtarget.hasCDI()
1866 
1867     if (Subtarget.hasVPOPCNTDQ()) {
1868       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1869         setOperationAction(ISD::CTPOP, VT, Legal);
1870     }
1871   }
1872 
  // This block controls legalization of v32i1/v64i1, which are available with
  // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
  // useBWIRegs.
1876   if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1877     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1878     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1879 
1880     for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1881       setOperationAction(ISD::VSELECT,            VT, Expand);
1882       setOperationAction(ISD::TRUNCATE,           VT, Custom);
1883       setOperationAction(ISD::SETCC,              VT, Custom);
1884       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1885       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1886       setOperationAction(ISD::SELECT,             VT, Custom);
1887       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1888       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1889       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1890       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1891     }
1892 
1893     for (auto VT : { MVT::v16i1, MVT::v32i1 })
1894       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1895 
1896     // Extends from v32i1 masks to 256-bit vectors.
1897     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
1898     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
1899     setOperationAction(ISD::ANY_EXTEND,         MVT::v32i8, Custom);
1900 
1901     for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1902       setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
1903       setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1904     }
1905 
1906     // These operations are handled on non-VLX by artificially widening in
1907     // isel patterns.
1908     // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1909 
1910     if (Subtarget.hasBITALG()) {
1911       for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1912         setOperationAction(ISD::CTPOP, VT, Legal);
1913     }
1914   }
1915 
1916   if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
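    // Operation actions shared between the f16 scalar type and the vXf16
    // vector types.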
1917     auto setGroup = [&] (MVT VT) {
1918       setOperationAction(ISD::FADD,               VT, Legal);
1919       setOperationAction(ISD::STRICT_FADD,        VT, Legal);
1920       setOperationAction(ISD::FSUB,               VT, Legal);
1921       setOperationAction(ISD::STRICT_FSUB,        VT, Legal);
1922       setOperationAction(ISD::FMUL,               VT, Legal);
1923       setOperationAction(ISD::STRICT_FMUL,        VT, Legal);
1924       setOperationAction(ISD::FDIV,               VT, Legal);
1925       setOperationAction(ISD::STRICT_FDIV,        VT, Legal);
1926       setOperationAction(ISD::FSQRT,              VT, Legal);
1927       setOperationAction(ISD::STRICT_FSQRT,       VT, Legal);
1928 
1929       setOperationAction(ISD::FFLOOR,             VT, Legal);
1930       setOperationAction(ISD::STRICT_FFLOOR,      VT, Legal);
1931       setOperationAction(ISD::FCEIL,              VT, Legal);
1932       setOperationAction(ISD::STRICT_FCEIL,       VT, Legal);
1933       setOperationAction(ISD::FTRUNC,             VT, Legal);
1934       setOperationAction(ISD::STRICT_FTRUNC,      VT, Legal);
1935       setOperationAction(ISD::FRINT,              VT, Legal);
1936       setOperationAction(ISD::STRICT_FRINT,       VT, Legal);
1937       setOperationAction(ISD::FNEARBYINT,         VT, Legal);
1938       setOperationAction(ISD::STRICT_FNEARBYINT,  VT, Legal);
1939 
1940       setOperationAction(ISD::LOAD,               VT, Legal);
1941       setOperationAction(ISD::STORE,              VT, Legal);
1942 
1943       setOperationAction(ISD::FMA,                VT, Legal);
1944       setOperationAction(ISD::STRICT_FMA,         VT, Legal);
1945       setOperationAction(ISD::VSELECT,            VT, Legal);
1946       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1947       setOperationAction(ISD::SELECT,             VT, Custom);
1948 
1949       setOperationAction(ISD::FNEG,               VT, Custom);
1950       setOperationAction(ISD::FABS,               VT, Custom);
1951       setOperationAction(ISD::FCOPYSIGN,          VT, Custom);
1952       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1953       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1954     };
1955 
1956     // AVX512_FP16 scalar operations
1957     setGroup(MVT::f16);
1958     addRegisterClass(MVT::f16,    &X86::FR16XRegClass);
1959     setOperationAction(ISD::FREM,                 MVT::f16, Promote);
1960     setOperationAction(ISD::STRICT_FREM,          MVT::f16, Promote);
1961     setOperationAction(ISD::SELECT_CC,            MVT::f16, Expand);
1962     setOperationAction(ISD::BR_CC,                MVT::f16, Expand);
1963     setOperationAction(ISD::SETCC,                MVT::f16, Custom);
1964     setOperationAction(ISD::STRICT_FSETCC,        MVT::f16, Custom);
1965     setOperationAction(ISD::STRICT_FSETCCS,       MVT::f16, Custom);
1966     setOperationAction(ISD::FROUND,               MVT::f16, Custom);
1967     setOperationAction(ISD::STRICT_FROUND,        MVT::f16, Promote);
1968     setOperationAction(ISD::FROUNDEVEN,           MVT::f16, Legal);
1969     setOperationAction(ISD::STRICT_FROUNDEVEN,    MVT::f16, Legal);
1970     setOperationAction(ISD::FP_ROUND,             MVT::f16, Custom);
1971     setOperationAction(ISD::STRICT_FP_ROUND,      MVT::f16, Custom);
1972     setOperationAction(ISD::STRICT_FP_EXTEND,     MVT::f32, Legal);
1973     if (isTypeLegal(MVT::f80)) {
1974       setOperationAction(ISD::FP_EXTEND,          MVT::f80, Custom);
1975       setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::f80, Custom);
1976     }
1977 
1978     setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
1979     setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
1980 
1981     if (Subtarget.useAVX512Regs()) {
1982       setGroup(MVT::v32f16);
1983       addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1984       setOperationAction(ISD::SCALAR_TO_VECTOR,       MVT::v32f16, Custom);
1985       setOperationAction(ISD::SINT_TO_FP,             MVT::v32i16, Legal);
1986       setOperationAction(ISD::STRICT_SINT_TO_FP,      MVT::v32i16, Legal);
1987       setOperationAction(ISD::UINT_TO_FP,             MVT::v32i16, Legal);
1988       setOperationAction(ISD::STRICT_UINT_TO_FP,      MVT::v32i16, Legal);
1989       setOperationAction(ISD::STRICT_FP_ROUND,        MVT::v16f16, Legal);
1990       setOperationAction(ISD::STRICT_FP_EXTEND,       MVT::v16f32, Legal);
1991       setOperationAction(ISD::INSERT_VECTOR_ELT,      MVT::v32f16, Custom);
1992 
1993       setOperationAction(ISD::FP_TO_SINT,             MVT::v32i16, Custom);
1994       setOperationAction(ISD::STRICT_FP_TO_SINT,      MVT::v32i16, Custom);
1995       setOperationAction(ISD::FP_TO_UINT,             MVT::v32i16, Custom);
1996       setOperationAction(ISD::STRICT_FP_TO_UINT,      MVT::v32i16, Custom);
1997       setOperationPromotedToType(ISD::FP_TO_SINT,     MVT::v32i8,  MVT::v32i16);
1998       setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
1999                                  MVT::v32i16);
2000       setOperationPromotedToType(ISD::FP_TO_UINT,     MVT::v32i8,  MVT::v32i16);
2001       setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2002                                  MVT::v32i16);
2003       setOperationPromotedToType(ISD::FP_TO_SINT,     MVT::v32i1,  MVT::v32i16);
2004       setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2005                                  MVT::v32i16);
2006       setOperationPromotedToType(ISD::FP_TO_UINT,     MVT::v32i1,  MVT::v32i16);
2007       setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2008                                  MVT::v32i16);
2009 
2010       setOperationAction(ISD::EXTRACT_SUBVECTOR,      MVT::v16f16, Legal);
2011       setOperationAction(ISD::INSERT_SUBVECTOR,       MVT::v32f16, Legal);
2012       setOperationAction(ISD::CONCAT_VECTORS,         MVT::v32f16, Custom);
2013 
2014       setLoadExtAction(ISD::EXTLOAD, MVT::v8f64,  MVT::v8f16,  Legal);
2015       setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2016 
2017       setOperationAction(ISD::STRICT_FSETCC,      MVT::v32i1, Custom);
2018       setOperationAction(ISD::STRICT_FSETCCS,     MVT::v32i1, Custom);
2019     }
2020 
2021     if (Subtarget.hasVLX()) {
2022       addRegisterClass(MVT::v8f16,  &X86::VR128XRegClass);
2023       addRegisterClass(MVT::v16f16, &X86::VR256XRegClass);
2024       setGroup(MVT::v8f16);
2025       setGroup(MVT::v16f16);
2026 
2027       setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8f16,  Legal);
2028       setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16f16, Custom);
2029       setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Legal);
2030       setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v16i16, Legal);
2031       setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16,  Legal);
2032       setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v8i16,  Legal);
2033       setOperationAction(ISD::UINT_TO_FP,         MVT::v16i16, Legal);
2034       setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v16i16, Legal);
2035       setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16,  Legal);
2036       setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v8i16,  Legal);
2037 
2038       setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Custom);
2039       setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v8i16, Custom);
2040       setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Custom);
2041       setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v8i16, Custom);
2042       setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v8f16, Legal);
2043       setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v8f32, Legal);
2044 
2045       // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2046       setOperationAction(ISD::INSERT_VECTOR_ELT,    MVT::v8f16,  Custom);
2047       setOperationAction(ISD::INSERT_VECTOR_ELT,    MVT::v16f16, Custom);
2048 
2049       setOperationAction(ISD::EXTRACT_SUBVECTOR,    MVT::v8f16, Legal);
2050       setOperationAction(ISD::INSERT_SUBVECTOR,     MVT::v16f16, Legal);
2051       setOperationAction(ISD::CONCAT_VECTORS,       MVT::v16f16, Custom);
2052 
2053       setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2054       setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2055       setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2056       setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2057 
2058       // Need to custom widen these to prevent scalarization.
2059       setOperationAction(ISD::LOAD,  MVT::v4f16, Custom);
2060       setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2061     }
2062 
    // Support the fp16 zero immediate.
2064     addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
2065   }
2066 
2067   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
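    // With VLX, the truncating-store instructions (VPMOVQB/VPMOVQW/VPMOVQD,
    // VPMOVDB/VPMOVDW) are also available on 128/256-bit vectors.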
2068     setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
2069     setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2070     setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2071     setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
2072     setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2073 
2074     setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
2075     setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2076     setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2077     setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
2078     setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2079 
2080     if (Subtarget.hasBWI()) {
2081       setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
2082       setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
2083     }
2084 
2085     if (Subtarget.hasFP16()) {
2086       // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2087       setOperationAction(ISD::FP_TO_SINT,        MVT::v2f16, Custom);
2088       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2089       setOperationAction(ISD::FP_TO_UINT,        MVT::v2f16, Custom);
2090       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2091       setOperationAction(ISD::FP_TO_SINT,        MVT::v4f16, Custom);
2092       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2093       setOperationAction(ISD::FP_TO_UINT,        MVT::v4f16, Custom);
2094       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2095       // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2096       setOperationAction(ISD::SINT_TO_FP,        MVT::v2f16, Custom);
2097       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2098       setOperationAction(ISD::UINT_TO_FP,        MVT::v2f16, Custom);
2099       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2100       setOperationAction(ISD::SINT_TO_FP,        MVT::v4f16, Custom);
2101       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2102       setOperationAction(ISD::UINT_TO_FP,        MVT::v4f16, Custom);
2103       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2104       // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2105       setOperationAction(ISD::FP_ROUND,          MVT::v2f16, Custom);
2106       setOperationAction(ISD::STRICT_FP_ROUND,   MVT::v2f16, Custom);
2107       setOperationAction(ISD::FP_ROUND,          MVT::v4f16, Custom);
2108       setOperationAction(ISD::STRICT_FP_ROUND,   MVT::v4f16, Custom);
2109       // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2110       setOperationAction(ISD::FP_EXTEND,         MVT::v2f16, Custom);
2111       setOperationAction(ISD::STRICT_FP_EXTEND,  MVT::v2f16, Custom);
2112       setOperationAction(ISD::FP_EXTEND,         MVT::v4f16, Custom);
2113       setOperationAction(ISD::STRICT_FP_EXTEND,  MVT::v4f16, Custom);
2114     }
2115 
2116     setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2117     setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2118     setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2119   }
2120 
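  // AMX tile data is modeled with the opaque MVT::x86amx type.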
2121   if (Subtarget.hasAMXTILE()) {
2122     addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2123   }
2124 
2125   // We want to custom lower some of our intrinsics.
2126   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2127   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2128   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2129   if (!Subtarget.is64Bit()) {
2130     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2131   }
2132 
2133   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2134   // handle type legalization for these operations here.
2135   //
2136   // FIXME: We really should do custom legalization for addition and
2137   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
2138   // than generic legalization for 64-bit multiplication-with-overflow, though.
2139   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2140     if (VT == MVT::i64 && !Subtarget.is64Bit())
2141       continue;
2142     // Add/Sub/Mul with overflow operations are custom lowered.
2143     setOperationAction(ISD::SADDO, VT, Custom);
2144     setOperationAction(ISD::UADDO, VT, Custom);
2145     setOperationAction(ISD::SSUBO, VT, Custom);
2146     setOperationAction(ISD::USUBO, VT, Custom);
2147     setOperationAction(ISD::SMULO, VT, Custom);
2148     setOperationAction(ISD::UMULO, VT, Custom);
2149 
    // Support carry-in as a value rather than glue.
2151     setOperationAction(ISD::ADDCARRY, VT, Custom);
2152     setOperationAction(ISD::SUBCARRY, VT, Custom);
2153     setOperationAction(ISD::SETCCCARRY, VT, Custom);
2154     setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2155     setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2156   }
2157 
2158   if (!Subtarget.is64Bit()) {
    // These libcalls are not available in 32-bit mode.
2160     setLibcallName(RTLIB::SHL_I128, nullptr);
2161     setLibcallName(RTLIB::SRL_I128, nullptr);
2162     setLibcallName(RTLIB::SRA_I128, nullptr);
2163     setLibcallName(RTLIB::MUL_I128, nullptr);
2164     // The MULO libcall is not part of libgcc, only compiler-rt.
2165     setLibcallName(RTLIB::MULO_I64, nullptr);
2166   }
2167   // The MULO libcall is not part of libgcc, only compiler-rt.
2168   setLibcallName(RTLIB::MULO_I128, nullptr);
2169 
2170   // Combine sin / cos into _sincos_stret if it is available.
2171   if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2172       getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2173     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2174     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2175   }
2176 
2177   if (Subtarget.isTargetWin64()) {
2178     setOperationAction(ISD::SDIV, MVT::i128, Custom);
2179     setOperationAction(ISD::UDIV, MVT::i128, Custom);
2180     setOperationAction(ISD::SREM, MVT::i128, Custom);
2181     setOperationAction(ISD::UREM, MVT::i128, Custom);
2182     setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2183     setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2184     setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2185     setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2186     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2187     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2188     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2189     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2190   }
2191 
  // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` is. We
  // should promote the value to 64 bits to solve this.
  // This is what the CRT headers do - `fmodf` is an inline header function
  // casting to f64 and calling `fmod`.
2196   if (Subtarget.is32Bit() &&
2197       (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2198     for (ISD::NodeType Op :
2199          {ISD::FCEIL,  ISD::STRICT_FCEIL,
2200           ISD::FCOS,   ISD::STRICT_FCOS,
2201           ISD::FEXP,   ISD::STRICT_FEXP,
2202           ISD::FFLOOR, ISD::STRICT_FFLOOR,
2203           ISD::FREM,   ISD::STRICT_FREM,
2204           ISD::FLOG,   ISD::STRICT_FLOG,
2205           ISD::FLOG10, ISD::STRICT_FLOG10,
2206           ISD::FPOW,   ISD::STRICT_FPOW,
2207           ISD::FSIN,   ISD::STRICT_FSIN})
2208       if (isOperationExpand(Op, MVT::f32))
2209         setOperationAction(Op, MVT::f32, Promote);
2210 
2211   // We have target-specific dag combine patterns for the following nodes:
2212   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
2213   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
2214   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
2215   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
2216   setTargetDAGCombine(ISD::CONCAT_VECTORS);
2217   setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
2218   setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
2219   setTargetDAGCombine(ISD::BITCAST);
2220   setTargetDAGCombine(ISD::VSELECT);
2221   setTargetDAGCombine(ISD::SELECT);
2222   setTargetDAGCombine(ISD::SHL);
2223   setTargetDAGCombine(ISD::SRA);
2224   setTargetDAGCombine(ISD::SRL);
2225   setTargetDAGCombine(ISD::OR);
2226   setTargetDAGCombine(ISD::AND);
2227   setTargetDAGCombine(ISD::ADD);
2228   setTargetDAGCombine(ISD::FADD);
2229   setTargetDAGCombine(ISD::FSUB);
2230   setTargetDAGCombine(ISD::FNEG);
2231   setTargetDAGCombine(ISD::FMA);
2232   setTargetDAGCombine(ISD::STRICT_FMA);
2233   setTargetDAGCombine(ISD::FMINNUM);
2234   setTargetDAGCombine(ISD::FMAXNUM);
2235   setTargetDAGCombine(ISD::SUB);
2236   setTargetDAGCombine(ISD::LOAD);
2237   setTargetDAGCombine(ISD::MLOAD);
2238   setTargetDAGCombine(ISD::STORE);
2239   setTargetDAGCombine(ISD::MSTORE);
2240   setTargetDAGCombine(ISD::TRUNCATE);
2241   setTargetDAGCombine(ISD::ZERO_EXTEND);
2242   setTargetDAGCombine(ISD::ANY_EXTEND);
2243   setTargetDAGCombine(ISD::SIGN_EXTEND);
2244   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
2245   setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
2246   setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
2247   setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
2248   setTargetDAGCombine(ISD::SINT_TO_FP);
2249   setTargetDAGCombine(ISD::UINT_TO_FP);
2250   setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
2251   setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
2252   setTargetDAGCombine(ISD::SETCC);
2253   setTargetDAGCombine(ISD::MUL);
2254   setTargetDAGCombine(ISD::XOR);
2255   setTargetDAGCombine(ISD::MSCATTER);
2256   setTargetDAGCombine(ISD::MGATHER);
2257   setTargetDAGCombine(ISD::FP16_TO_FP);
2258   setTargetDAGCombine(ISD::FP_EXTEND);
2259   setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
2260   setTargetDAGCombine(ISD::FP_ROUND);
2261 
2262   computeRegisterProperties(Subtarget.getRegisterInfo());
2263 
2264   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2265   MaxStoresPerMemsetOptSize = 8;
2266   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2267   MaxStoresPerMemcpyOptSize = 4;
2268   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2269   MaxStoresPerMemmoveOptSize = 4;
2270 
2271   // TODO: These control memcmp expansion in CGP and could be raised higher, but
  // that needs to be benchmarked and balanced with the potential use of vector
2273   // load/store types (PR33329, PR33914).
2274   MaxLoadsPerMemcmp = 2;
2275   MaxLoadsPerMemcmpOptSize = 2;
2276 
2277   // Default loop alignment, which can be overridden by -align-loops.
2278   setPrefLoopAlignment(Align(16));
2279 
2280   // An out-of-order CPU can speculatively execute past a predictable branch,
2281   // but a conditional move could be stalled by an expensive earlier operation.
2282   PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2283   EnableExtLdPromotion = true;
2284   setPrefFunctionAlignment(Align(16));
2285 
2286   verifyIntrinsicTables();
2287 
2288   // Default to having -disable-strictnode-mutation on
2289   IsStrictFPEnabled = true;
2290 }
2291 
2292 // This has so far only been implemented for 64-bit MachO.
2293 bool X86TargetLowering::useLoadStackGuardNode() const {
2294   return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2295 }
2296 
2297 bool X86TargetLowering::useStackGuardXorFP() const {
2298   // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2299   return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2300 }
2301 
2302 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2303                                                const SDLoc &DL) const {
2304   EVT PtrTy = getPointerTy(DAG.getDataLayout());
2305   unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2306   MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2307   return SDValue(Node, 0);
2308 }
2309 
2310 TargetLoweringBase::LegalizeTypeAction
2311 X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2312   if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2313       !Subtarget.hasBWI())
2314     return TypeSplitVector;
2315 
2316   if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2317       VT.getVectorElementType() != MVT::i1)
2318     return TypeWidenVector;
2319 
2320   return TargetLoweringBase::getPreferredVectorAction(VT);
2321 }
2322 
2323 static std::pair<MVT, unsigned>
2324 handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2325                                  const X86Subtarget &Subtarget) {
2326   // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2327   // convention is one that uses k registers.
2328   if (NumElts == 2)
2329     return {MVT::v2i64, 1};
2330   if (NumElts == 4)
2331     return {MVT::v4i32, 1};
2332   if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2333       CC != CallingConv::Intel_OCL_BI)
2334     return {MVT::v8i16, 1};
2335   if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2336       CC != CallingConv::Intel_OCL_BI)
2337     return {MVT::v16i8, 1};
2338   // v32i1 passes in ymm unless we have BWI and the calling convention is
2339   // regcall.
2340   if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2341     return {MVT::v32i8, 1};
2342   // Split v64i1 vectors if we don't have v64i8 available.
2343   if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2344     if (Subtarget.useAVX512Regs())
2345       return {MVT::v64i8, 1};
2346     return {MVT::v32i8, 2};
2347   }
2348 
2349   // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2350   if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2351       NumElts > 64)
2352     return {MVT::i8, NumElts};
2353 
2354   return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2355 }
2356 
2357 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2358                                                      CallingConv::ID CC,
2359                                                      EVT VT) const {
2360   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2361       Subtarget.hasAVX512()) {
2362     unsigned NumElts = VT.getVectorNumElements();
2363 
2364     MVT RegisterVT;
2365     unsigned NumRegisters;
2366     std::tie(RegisterVT, NumRegisters) =
2367         handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2368     if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2369       return RegisterVT;
2370   }
2371 
  // v3f16 will be widened to v4f16, but we don't assign a register class for
  // v4f16, so its default register type is f16. We override the type to v8f16
  // here.
2374   if (VT == MVT::v3f16 && Subtarget.hasFP16())
2375     return MVT::v8f16;
2376 
  // We will use more GPRs for f64 and f80 in 32-bit mode when x87 is
  // disabled.
2378   if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
2379       !Subtarget.hasX87())
2380     return MVT::i32;
2381 
2382   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2383 }
2384 
2385 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2386                                                           CallingConv::ID CC,
2387                                                           EVT VT) const {
2388   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2389       Subtarget.hasAVX512()) {
2390     unsigned NumElts = VT.getVectorNumElements();
2391 
2392     MVT RegisterVT;
2393     unsigned NumRegisters;
2394     std::tie(RegisterVT, NumRegisters) =
2395         handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2396     if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2397       return NumRegisters;
2398   }
2399 
  // v3f16 will be widened to v4f16, but we don't assign a register class for
  // v4f16, so its default register number is 3. We override the number to 1
  // here.
2402   if (VT == MVT::v3f16 && Subtarget.hasFP16())
2403     return 1;
2404 
  // We have to split f64 into 2 registers and f80 into 3 registers in 32-bit
  // mode if x87 is disabled.
2407   if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
2408     if (VT == MVT::f64)
2409       return 2;
2410     if (VT == MVT::f80)
2411       return 3;
2412   }
2413 
2414   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2415 }
2416 
2417 unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2418     LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2419     unsigned &NumIntermediates, MVT &RegisterVT) const {
2420   // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2421   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2422       Subtarget.hasAVX512() &&
2423       (!isPowerOf2_32(VT.getVectorNumElements()) ||
2424        (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2425        VT.getVectorNumElements() > 64)) {
2426     RegisterVT = MVT::i8;
2427     IntermediateVT = MVT::i1;
2428     NumIntermediates = VT.getVectorNumElements();
2429     return NumIntermediates;
2430   }
2431 
2432   // Split v64i1 vectors if we don't have v64i8 available.
2433   if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2434       CC != CallingConv::X86_RegCall) {
2435     RegisterVT = MVT::v32i8;
2436     IntermediateVT = MVT::v32i1;
2437     NumIntermediates = 2;
2438     return 2;
2439   }
2440 
  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
2443 }
2444 
2445 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2446                                           LLVMContext& Context,
2447                                           EVT VT) const {
2448   if (!VT.isVector())
2449     return MVT::i8;
2450 
2451   if (Subtarget.hasAVX512()) {
2452     // Figure out what this type will be legalized to.
2453     EVT LegalVT = VT;
2454     while (getTypeAction(Context, LegalVT) != TypeLegal)
2455       LegalVT = getTypeToTransformTo(Context, LegalVT);
2456 
2457     // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2458     if (LegalVT.getSimpleVT().is512BitVector())
2459       return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2460 
2461     if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2462       // If we legalized to less than a 512-bit vector, then we will use a vXi1
2463       // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2464       // vXi16/vXi8.
2465       MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2466       if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2467         return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2468     }
2469   }
2470 
2471   return VT.changeVectorElementTypeToInteger();
2472 }
2473 
2474 /// Helper for getByValTypeAlignment to determine
2475 /// the desired ByVal argument alignment.
2476 static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2477   if (MaxAlign == 16)
2478     return;
2479   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2480     if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
2481       MaxAlign = Align(16);
2482   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2483     Align EltAlign;
2484     getMaxByValAlign(ATy->getElementType(), EltAlign);
2485     if (EltAlign > MaxAlign)
2486       MaxAlign = EltAlign;
2487   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2488     for (auto *EltTy : STy->elements()) {
2489       Align EltAlign;
2490       getMaxByValAlign(EltTy, EltAlign);
2491       if (EltAlign > MaxAlign)
2492         MaxAlign = EltAlign;
2493       if (MaxAlign == 16)
2494         break;
2495     }
2496   }
2497 }
2498 
2499 /// Return the desired alignment for ByVal aggregate
2500 /// function arguments in the caller parameter area. For X86, aggregates
2501 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
2502 /// are at 4-byte boundaries.
2503 uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
2504                                                   const DataLayout &DL) const {
2505   if (Subtarget.is64Bit()) {
2506     // Max of 8 and alignment of type.
2507     Align TyAlign = DL.getABITypeAlign(Ty);
2508     if (TyAlign > 8)
2509       return TyAlign.value();
2510     return 8;
2511   }
2512 
2513   Align Alignment(4);
2514   if (Subtarget.hasSSE1())
2515     getMaxByValAlign(Ty, Alignment);
2516   return Alignment.value();
2517 }
2518 
/// Return the optimal value type for this memory operation, or EVT::Other if
/// the type should be determined using generic target-independent logic.
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width.
2523 EVT X86TargetLowering::getOptimalMemOpType(
2524     const MemOp &Op, const AttributeList &FuncAttributes) const {
2525   if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2526     if (Op.size() >= 16 &&
2527         (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2528       // FIXME: Check if unaligned 64-byte accesses are slow.
2529       if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2530           (Subtarget.getPreferVectorWidth() >= 512)) {
2531         return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2532       }
2533       // FIXME: Check if unaligned 32-byte accesses are slow.
2534       if (Op.size() >= 32 && Subtarget.hasAVX() &&
2535           (Subtarget.getPreferVectorWidth() >= 256)) {
2536         // Although this isn't a well-supported type for AVX1, we'll let
2537         // legalization and shuffle lowering produce the optimal codegen. If we
2538         // choose an optimal type with a vector element larger than a byte,
2539         // getMemsetStores() may create an intermediate splat (using an integer
2540         // multiply) before we splat as a vector.
2541         return MVT::v32i8;
2542       }
2543       if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2544         return MVT::v16i8;
2545       // TODO: Can SSE1 handle a byte vector?
2546       // If we have SSE1 registers we should be able to use them.
2547       if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2548           (Subtarget.getPreferVectorWidth() >= 128))
2549         return MVT::v4f32;
2550     } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2551                Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2552       // Do not use f64 to lower memcpy if source is string constant. It's
2553       // better to use i32 to avoid the loads.
2554       // Also, do not use f64 to lower memset unless this is a memset of zeros.
2555       // The gymnastics of splatting a byte value into an XMM register and then
2556       // only using 8-byte stores (because this is a CPU with slow unaligned
2557       // 16-byte accesses) makes that a loser.
2558       return MVT::f64;
2559     }
2560   }
2561   // This is a compromise. If we reach here, unaligned accesses may be slow on
2562   // this target. However, creating smaller, aligned accesses could be even
2563   // slower and would certainly be a lot more code.
2564   if (Subtarget.is64Bit() && Op.size() >= 8)
2565     return MVT::i64;
2566   return MVT::i32;
2567 }
2568 
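// f32/f64 are only safe memop types when scalar SSE math is enabled; without
// SSE these values would be copied through the x87 stack, which need not be
// bit-exact (e.g. it quiets SNaNs).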
2569 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2570   if (VT == MVT::f32)
2571     return X86ScalarSSEf32;
2572   if (VT == MVT::f64)
2573     return X86ScalarSSEf64;
2574   return true;
2575 }
2576 
2577 bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2578     EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2579     bool *Fast) const {
2580   if (Fast) {
2581     switch (VT.getSizeInBits()) {
2582     default:
2583       // 8-byte and under are always assumed to be fast.
2584       *Fast = true;
2585       break;
2586     case 128:
2587       *Fast = !Subtarget.isUnalignedMem16Slow();
2588       break;
2589     case 256:
2590       *Fast = !Subtarget.isUnalignedMem32Slow();
2591       break;
2592     // TODO: What about AVX-512 (512-bit) accesses?
2593     }
2594   }
2595   // NonTemporal vector memory ops must be aligned.
2596   if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    // NT loads can only be vector aligned, so if it's less aligned than the
2598     // minimum vector size (which we can split the vector down to), we might as
2599     // well use a regular unaligned vector load.
2600     // We don't have any NT loads pre-SSE41.
2601     if (!!(Flags & MachineMemOperand::MOLoad))
2602       return (Alignment < 16 || !Subtarget.hasSSE41());
2603     return false;
2604   }
2605   // Misaligned accesses of any size are always allowed.
2606   return true;
2607 }
2608 
2609 /// Return the entry encoding for a jump table in the
2610 /// current function.  The returned value is a member of the
2611 /// MachineJumpTableInfo::JTEntryKind enum.
2612 unsigned X86TargetLowering::getJumpTableEncoding() const {
2613   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2614   // symbol.
2615   if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2616     return MachineJumpTableInfo::EK_Custom32;
2617 
2618   // Otherwise, use the normal jump table encoding heuristics.
2619   return TargetLowering::getJumpTableEncoding();
2620 }
2621 
2622 bool X86TargetLowering::useSoftFloat() const {
2623   return Subtarget.useSoftFloat();
2624 }
2625 
2626 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2627                                               ArgListTy &Args) const {
2628 
2629   // Only relabel X86-32 for C / Stdcall CCs.
2630   if (Subtarget.is64Bit())
2631     return;
2632   if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2633     return;
2634   unsigned ParamRegs = 0;
2635   if (auto *M = MF->getFunction().getParent())
2636     ParamRegs = M->getNumberRegisterParameters();
2637 
  // Mark up to the first ParamRegs integer arguments as being passed in
  // registers; arguments wider than 4 bytes consume two register slots.
2639   for (auto &Arg : Args) {
2640     Type *T = Arg.Ty;
2641     if (T->isIntOrPtrTy())
2642       if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2643         unsigned numRegs = 1;
2644         if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2645           numRegs = 2;
2646         if (ParamRegs < numRegs)
2647           return;
2648         ParamRegs -= numRegs;
2649         Arg.IsInReg = true;
2650       }
2651   }
2652 }
2653 
const MCExpr *X86TargetLowering::LowerCustomJumpTableEntry(
    const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB,
    unsigned uid, MCContext &Ctx) const {
2658   assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2659   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2660   // entries.
2661   return MCSymbolRefExpr::create(MBB->getSymbol(),
2662                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
2663 }
2664 
2665 /// Returns relocation base for the given PIC jumptable.
2666 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2667                                                     SelectionDAG &DAG) const {
2668   if (!Subtarget.is64Bit())
2669     // This doesn't have SDLoc associated with it, but is not really the
2670     // same as a Register.
2671     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2672                        getPointerTy(DAG.getDataLayout()));
2673   return Table;
2674 }
2675 
2676 /// This returns the relocation base for the given PIC jumptable,
2677 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
2678 const MCExpr *X86TargetLowering::
2679 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2680                              MCContext &Ctx) const {
2681   // X86-64 uses RIP relative addressing based on the jump table label.
2682   if (Subtarget.isPICStyleRIPRel())
2683     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2684 
2685   // Otherwise, the reference is relative to the PIC base.
2686   return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2687 }
2688 
2689 std::pair<const TargetRegisterClass *, uint8_t>
2690 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2691                                            MVT VT) const {
2692   const TargetRegisterClass *RRC = nullptr;
2693   uint8_t Cost = 1;
2694   switch (VT.SimpleTy) {
2695   default:
2696     return TargetLowering::findRepresentativeClass(TRI, VT);
2697   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2698     RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2699     break;
2700   case MVT::x86mmx:
2701     RRC = &X86::VR64RegClass;
2702     break;
2703   case MVT::f32: case MVT::f64:
2704   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2705   case MVT::v4f32: case MVT::v2f64:
2706   case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2707   case MVT::v8f32: case MVT::v4f64:
2708   case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2709   case MVT::v16f32: case MVT::v8f64:
2710     RRC = &X86::VR128XRegClass;
2711     break;
2712   }
2713   return std::make_pair(RRC, Cost);
2714 }
2715 
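// Segment-register address spaces: 256 selects %gs and 257 selects %fs
// (X86AS::GS / X86AS::FS). The TLS-based stack guard is addressed via %fs for
// 64-bit user code and via %gs for the kernel code model and for 32-bit code.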
2716 unsigned X86TargetLowering::getAddressSpace() const {
2717   if (Subtarget.is64Bit())
2718     return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2719   return 256;
2720 }
2721 
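// Targets whose C runtime reserves a fixed TLS slot for the stack guard:
// glibc, Fuchsia, and bionic starting with Android API level 17 (4.2).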
2722 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2723   return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2724          (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2725 }
2726 
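// Build an inttoptr constant that addresses `Offset` bytes into the segment
// selected by `AddressSpace` (e.g. %fs:Offset or %gs:Offset).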
2727 static Constant* SegmentOffset(IRBuilderBase &IRB,
2728                                int Offset, unsigned AddressSpace) {
2729   return ConstantExpr::getIntToPtr(
2730       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2731       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2732 }
2733 
2734 Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
2735   // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2736   // tcbhead_t; use it instead of the usual global variable (see
2737   // sysdeps/{i386,x86_64}/nptl/tls.h)
2738   if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2739     if (Subtarget.isTargetFuchsia()) {
2740       // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2741       return SegmentOffset(IRB, 0x10, getAddressSpace());
2742     } else {
2743       unsigned AddressSpace = getAddressSpace();
2744       Module *M = IRB.GetInsertBlock()->getParent()->getParent();
      // In particular, users may customize the base register and offset.
2746       int Offset = M->getStackProtectorGuardOffset();
      // If the -stack-protector-guard-offset value is not set, default to
      // %fs:0x28, unless we're using a Kernel code model, in which case it's
      // %gs:0x28.  %gs:0x14 on i386.
2750       if (Offset == INT_MAX)
2751         Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2752 
2753       StringRef GuardReg = M->getStackProtectorGuardReg();
2754       if (GuardReg == "fs")
2755         AddressSpace = X86AS::FS;
2756       else if (GuardReg == "gs")
2757         AddressSpace = X86AS::GS;
2758       return SegmentOffset(IRB, Offset, AddressSpace);
2759     }
2760   }
2761   return TargetLowering::getIRStackGuard(IRB);
2762 }
2763 
2764 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // The MSVC CRT provides functionality for stack protection.
2766   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2767       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    // The MSVC CRT has a global variable holding the security cookie.
2769     M.getOrInsertGlobal("__security_cookie",
2770                         Type::getInt8PtrTy(M.getContext()));
2771 
    // The MSVC CRT has a function to validate the security cookie.
2773     FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2774         "__security_check_cookie", Type::getVoidTy(M.getContext()),
2775         Type::getInt8PtrTy(M.getContext()));
2776     if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2777       F->setCallingConv(CallingConv::X86_FastCall);
2778       F->addParamAttr(0, Attribute::AttrKind::InReg);
2779     }
2780     return;
2781   }
2782 
2783   StringRef GuardMode = M.getStackProtectorGuard();
2784 
2785   // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2786   if ((GuardMode == "tls" || GuardMode.empty()) &&
2787       hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2788     return;
2789   TargetLowering::insertSSPDeclarations(M);
2790 }
2791 
2792 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  // The MSVC CRT has a global variable holding the security cookie.
2794   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2795       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2796     return M.getGlobalVariable("__security_cookie");
2797   }
2798   return TargetLowering::getSDagStackGuard(M);
2799 }
2800 
2801 Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // The MSVC CRT has a function to validate the security cookie.
2803   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2804       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2805     return M.getFunction("__security_check_cookie");
2806   }
2807   return TargetLowering::getSSPStackGuardCheck(M);
2808 }
2809 
2810 Value *
2811 X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
2812   if (Subtarget.getTargetTriple().isOSContiki())
2813     return getDefaultSafeStackPointerLocation(IRB, false);
2814 
2815   // Android provides a fixed TLS slot for the SafeStack pointer. See the
2816   // definition of TLS_SLOT_SAFESTACK in
2817   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2818   if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case it's
    // %gs:0x48.  %gs:0x24 on i386.
2821     int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2822     return SegmentOffset(IRB, Offset, getAddressSpace());
2823   }
2824 
2825   // Fuchsia is similar.
2826   if (Subtarget.isTargetFuchsia()) {
2827     // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2828     return SegmentOffset(IRB, 0x18, getAddressSpace());
2829   }
2830 
2831   return TargetLowering::getSafeStackPointerLocation(IRB);
2832 }
2833 
2834 //===----------------------------------------------------------------------===//
2835 //               Return Value Calling Convention Implementation
2836 //===----------------------------------------------------------------------===//
2837 
2838 bool X86TargetLowering::CanLowerReturn(
2839     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2840     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2841   SmallVector<CCValAssign, 16> RVLocs;
2842   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2843   return CCInfo.CheckReturn(Outs, RetCC_X86);
2844 }
2845 
2846 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2847   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2848   return ScratchRegs;
2849 }
2850 
/// Lowers mask values (v*i1) to the local register values.
/// \returns the DAG node after lowering to the register type.
2853 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2854                                const SDLoc &Dl, SelectionDAG &DAG) {
2855   EVT ValVT = ValArg.getValueType();
2856 
2857   if (ValVT == MVT::v1i1)
2858     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2859                        DAG.getIntPtrConstant(0, Dl));
2860 
2861   if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2862       (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two-stage lowering might be required:
2864     // bitcast:   v8i1 -> i8 / v16i1 -> i16
2865     // anyextend: i8   -> i32 / i16   -> i32
2866     EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2867     SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2868     if (ValLoc == MVT::i32)
2869       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2870     return ValToCopy;
2871   }
2872 
2873   if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2874       (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2875     // One stage lowering is required
2876     // bitcast:   v32i1 -> i32 / v64i1 -> i64
2877     return DAG.getBitcast(ValLoc, ValArg);
2878   }
2879 
2880   return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2881 }
2882 
/// Breaks a v64i1 value into two registers and adds the new nodes to the DAG.
2884 static void Passv64i1ArgInRegs(
2885     const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2886     SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
2887     CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2888   assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2889   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2890   assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2891   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2892          "The value should reside in two registers");
2893 
2894   // Before splitting the value we cast it to i64
2895   Arg = DAG.getBitcast(MVT::i64, Arg);
2896 
2897   // Splitting the value into two i32 types
2898   SDValue Lo, Hi;
2899   Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2900                    DAG.getConstant(0, Dl, MVT::i32));
2901   Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2902                    DAG.getConstant(1, Dl, MVT::i32));
2903 
  // Pass the two i32 halves in the corresponding registers.
2905   RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2906   RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2907 }
2908 
2909 SDValue
2910 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2911                                bool isVarArg,
2912                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2913                                const SmallVectorImpl<SDValue> &OutVals,
2914                                const SDLoc &dl, SelectionDAG &DAG) const {
2915   MachineFunction &MF = DAG.getMachineFunction();
2916   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2917 
2918   // In some cases we need to disable registers from the default CSR list.
2919   // For example, when they are used for argument passing.
2920   bool ShouldDisableCalleeSavedRegister =
2921       CallConv == CallingConv::X86_RegCall ||
2922       MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2923 
2924   if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2925     report_fatal_error("X86 interrupts may not return any value");
2926 
2927   SmallVector<CCValAssign, 16> RVLocs;
2928   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2929   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2930 
2931   SmallVector<std::pair<Register, SDValue>, 4> RetVals;
2932   for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2933        ++I, ++OutsIndex) {
2934     CCValAssign &VA = RVLocs[I];
2935     assert(VA.isRegLoc() && "Can only return in registers!");
2936 
2937     // Add the register to the CalleeSaveDisableRegs list.
2938     if (ShouldDisableCalleeSavedRegister)
2939       MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2940 
2941     SDValue ValToCopy = OutVals[OutsIndex];
2942     EVT ValVT = ValToCopy.getValueType();
2943 
2944     // Promote values to the appropriate types.
2945     if (VA.getLocInfo() == CCValAssign::SExt)
2946       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2947     else if (VA.getLocInfo() == CCValAssign::ZExt)
2948       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2949     else if (VA.getLocInfo() == CCValAssign::AExt) {
2950       if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2951         ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2952       else
2953         ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2954     }
2955     else if (VA.getLocInfo() == CCValAssign::BCvt)
2956       ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2957 
2958     assert(VA.getLocInfo() != CCValAssign::FPExt &&
2959            "Unexpected FP-extend for return value.");
2960 
2961     // Report an error if we have attempted to return a value via an XMM
2962     // register and SSE was disabled.
2963     if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
2964       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2965       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2966     } else if (!Subtarget.hasSSE2() &&
2967                X86::FR64XRegClass.contains(VA.getLocReg()) &&
2968                ValVT == MVT::f64) {
2969       // When returning a double via an XMM register, report an error if SSE2 is
2970       // not enabled.
2971       errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2972       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2973     }
2974 
2975     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2976     // the RET instruction and handled by the FP Stackifier.
2977     if (VA.getLocReg() == X86::FP0 ||
2978         VA.getLocReg() == X86::FP1) {
2979       // If this is a copy from an xmm register to ST(0), use an FPExtend to
2980       // change the value to the FP stack register class.
2981       if (isScalarFPTypeInSSEReg(VA.getValVT()))
2982         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2983       RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2984       // Don't emit a copytoreg.
2985       continue;
2986     }
2987 
2988     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2989     // which is returned in RAX / RDX.
2990     if (Subtarget.is64Bit()) {
2991       if (ValVT == MVT::x86mmx) {
2992         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2993           ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2994           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2995                                   ValToCopy);
2996           // If we don't have SSE2 available, convert to v4f32 so the generated
2997           // register is legal.
2998           if (!Subtarget.hasSSE2())
2999             ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
3000         }
3001       }
3002     }
3003 
3004     if (VA.needsCustom()) {
3005       assert(VA.getValVT() == MVT::v64i1 &&
3006              "Currently the only custom case is when we split v64i1 to 2 regs");
3007 
3008       Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
3009                          Subtarget);
3010 
3011       // Add the second register to the CalleeSaveDisableRegs list.
3012       if (ShouldDisableCalleeSavedRegister)
3013         MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
3014     } else {
3015       RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3016     }
3017   }
3018 
3019   SDValue Flag;
3020   SmallVector<SDValue, 6> RetOps;
3021   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3022   // Operand #1 = Bytes To Pop
3023   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
3024                    MVT::i32));
3025 
3026   // Copy the result values into the output registers.
3027   for (auto &RetVal : RetVals) {
3028     if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
3029       RetOps.push_back(RetVal.second);
3030       continue; // Don't emit a copytoreg.
3031     }
3032 
3033     Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
3034     Flag = Chain.getValue(1);
3035     RetOps.push_back(
3036         DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
3037   }
3038 
  // The Swift calling convention does not require us to copy the sret
  // argument into %rax/%eax for the return, and SRetReturnReg is not set for
  // Swift.
3041 
3042   // All x86 ABIs require that for returning structs by value we copy
3043   // the sret argument into %rax/%eax (depending on ABI) for the return.
3044   // We saved the argument into a virtual register in the entry block,
3045   // so now we copy the value out and into %rax/%eax.
3046   //
3047   // Checking Function.hasStructRetAttr() here is insufficient because the IR
3048   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3049   // false, then an sret argument may be implicitly inserted in the SelDAG. In
3050   // either case FuncInfo->setSRetReturnReg() will have been called.
3051   if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3052     // When we have both sret and another return value, we should use the
3053     // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have an sret, RetOps[0] equals Chain.
3055 
3056     // For the case of sret and another return value, we have
3057     //   Chain_0 at the function entry
3058     //   Chain_1 = getCopyToReg(Chain_0) in the above loop
3059     // If we use Chain_1 in getCopyFromReg, we will have
3060     //   Val = getCopyFromReg(Chain_1)
3061     //   Chain_2 = getCopyToReg(Chain_1, Val) from below
3062 
3063     // getCopyToReg(Chain_0) will be glued together with
3064     // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3065     // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3066     //   Data dependency from Unit B to Unit A due to usage of Val in
3067     //     getCopyToReg(Chain_1, Val)
3068     //   Chain dependency from Unit A to Unit B
3069 
    // So here, we use RetOps[0] (i.e. Chain_0) for getCopyFromReg.
3071     SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3072                                      getPointerTy(MF.getDataLayout()));
3073 
3074     Register RetValReg
3075         = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3076           X86::RAX : X86::EAX;
3077     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
3078     Flag = Chain.getValue(1);
3079 
3080     // RAX/EAX now acts like a return value.
3081     RetOps.push_back(
3082         DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3083 
3084     // Add the returned register to the CalleeSaveDisableRegs list.
3085     if (ShouldDisableCalleeSavedRegister)
3086       MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3087   }
3088 
3089   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3090   const MCPhysReg *I =
3091       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3092   if (I) {
3093     for (; *I; ++I) {
3094       if (X86::GR64RegClass.contains(*I))
3095         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3096       else
3097         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3098     }
3099   }
3100 
3101   RetOps[0] = Chain;  // Update chain.
3102 
3103   // Add the flag if we have it.
3104   if (Flag.getNode())
3105     RetOps.push_back(Flag);
3106 
3107   X86ISD::NodeType opcode = X86ISD::RET_FLAG;
3108   if (CallConv == CallingConv::X86_INTR)
3109     opcode = X86ISD::IRET;
3110   return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3111 }
3112 
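/// Return true if the result of \p N is only consumed (possibly through a
/// CopyToReg or FP_EXTEND) by return nodes, in which case \p Chain is updated
/// to the chain that a tail-call return should use.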
3113 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3114   if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3115     return false;
3116 
3117   SDValue TCChain = Chain;
3118   SDNode *Copy = *N->use_begin();
3119   if (Copy->getOpcode() == ISD::CopyToReg) {
3120     // If the copy has a glue operand, we conservatively assume it isn't safe to
3121     // perform a tail call.
3122     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3123       return false;
3124     TCChain = Copy->getOperand(0);
3125   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3126     return false;
3127 
3128   bool HasRet = false;
3129   for (const SDNode *U : Copy->uses()) {
3130     if (U->getOpcode() != X86ISD::RET_FLAG)
3131       return false;
    // If we are returning more than one value, we can definitely not make a
    // tail call; see PR19530.
3134     if (U->getNumOperands() > 4)
3135       return false;
3136     if (U->getNumOperands() == 4 &&
3137         U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
3138       return false;
3139     HasRet = true;
3140   }
3141 
3142   if (!HasRet)
3143     return false;
3144 
3145   Chain = TCChain;
3146   return true;
3147 }
3148 
3149 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3150                                            ISD::NodeType ExtendKind) const {
3151   MVT ReturnMVT = MVT::i32;
3152 
3153   bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3154   if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3155     // The ABI does not require i1, i8 or i16 to be extended.
3156     //
3157     // On Darwin, there is code in the wild relying on Clang's old behaviour of
3158     // always extending i8/i16 return values, so keep doing that for now.
3159     // (PR26665).
3160     ReturnMVT = MVT::i8;
3161   }
3162 
3163   EVT MinVT = getRegisterType(Context, ReturnMVT);
3164   return VT.bitsLT(MinVT) ? MinVT : VT;
3165 }
3166 
/// Reads two 32-bit registers and creates a 64-bit mask value.
/// \param VA The current 32-bit value that needs to be assigned.
/// \param NextVA The next 32-bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag Represents the SDValue in the parent DAG node used
///                        for glue purposes. If the DAG is already using a
///                        physical register instead of a virtual one, we
///                        should glue our new SDValue to the InFlag SDValue.
/// \return a new 64-bit SDValue.
3176 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3177                                 SDValue &Root, SelectionDAG &DAG,
3178                                 const SDLoc &Dl, const X86Subtarget &Subtarget,
3179                                 SDValue *InFlag = nullptr) {
3180   assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
3181   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3182   assert(VA.getValVT() == MVT::v64i1 &&
3183          "Expecting first location of 64 bit width type");
3184   assert(NextVA.getValVT() == VA.getValVT() &&
3185          "The locations should have the same type");
3186   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3187          "The values should reside in two registers");
3188 
3189   SDValue Lo, Hi;
3190   SDValue ArgValueLo, ArgValueHi;
3191 
3192   MachineFunction &MF = DAG.getMachineFunction();
3193   const TargetRegisterClass *RC = &X86::GR32RegClass;
3194 
3195   // Read a 32 bit value from the registers.
3196   if (nullptr == InFlag) {
3197     // When no physical register is present,
3198     // create an intermediate virtual register.
3199     Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3200     ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3201     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3202     ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3203   } else {
3204     // When a physical register is available read the value from it and glue
3205     // the reads together.
3206     ArgValueLo =
3207       DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
3208     *InFlag = ArgValueLo.getValue(2);
3209     ArgValueHi =
3210       DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
3211     *InFlag = ArgValueHi.getValue(2);
3212   }
3213 
3214   // Convert the i32 type into v32i1 type.
3215   Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3216 
3217   // Convert the i32 type into v32i1 type.
3218   Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3219 
3220   // Concatenate the two values together.
3221   return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3222 }
3223 
/// Lower a register of various sizes (8/16/32/64) to a mask value of the
/// expected size (v8i1/v16i1/v32i1/v64i1).
/// \returns a DAG node containing the operand after lowering to the mask type.
3227 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3228                                const EVT &ValLoc, const SDLoc &Dl,
3229                                SelectionDAG &DAG) {
3230   SDValue ValReturned = ValArg;
3231 
3232   if (ValVT == MVT::v1i1)
3233     return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3234 
3235   if (ValVT == MVT::v64i1) {
    // On 32-bit targets this case is handled by getv64i1Argument.
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
    // On 64-bit targets there is no need to truncate the value, only bitcast
    // it.
3239   } else {
3240     MVT maskLen;
3241     switch (ValVT.getSimpleVT().SimpleTy) {
3242     case MVT::v8i1:
3243       maskLen = MVT::i8;
3244       break;
3245     case MVT::v16i1:
3246       maskLen = MVT::i16;
3247       break;
3248     case MVT::v32i1:
3249       maskLen = MVT::i32;
3250       break;
3251     default:
3252       llvm_unreachable("Expecting a vector of i1 types");
3253     }
3254 
3255     ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3256   }
3257   return DAG.getBitcast(ValVT, ValReturned);
3258 }
3259 
/// Lower the result values of a call into the appropriate copies out of
/// the physical registers they were returned in.
///
3263 SDValue X86TargetLowering::LowerCallResult(
3264     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3265     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3266     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3267     uint32_t *RegMask) const {
3268 
3269   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3270   // Assign locations to each value returned by this call.
3271   SmallVector<CCValAssign, 16> RVLocs;
3272   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3273                  *DAG.getContext());
3274   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3275 
3276   // Copy all of the result registers out of their specified physreg.
3277   for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3278        ++I, ++InsIndex) {
3279     CCValAssign &VA = RVLocs[I];
3280     EVT CopyVT = VA.getLocVT();
3281 
3282     // In some calling conventions we need to remove the used registers
3283     // from the register mask.
3284     if (RegMask) {
3285       for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3286            SubRegs.isValid(); ++SubRegs)
3287         RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3288     }
3289 
3290     // Report an error if there was an attempt to return FP values via XMM
3291     // registers.
3292     if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3293       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3294       if (VA.getLocReg() == X86::XMM1)
3295         VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3296       else
3297         VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3298     } else if (!Subtarget.hasSSE2() &&
3299                X86::FR64XRegClass.contains(VA.getLocReg()) &&
3300                CopyVT == MVT::f64) {
3301       errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3302       if (VA.getLocReg() == X86::XMM1)
3303         VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3304       else
3305         VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3306     }
3307 
3308     // If we prefer to use the value in xmm registers, copy it out as f80 and
3309     // use a truncate to move it from fp stack reg to xmm reg.
3310     bool RoundAfterCopy = false;
3311     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3312         isScalarFPTypeInSSEReg(VA.getValVT())) {
3313       if (!Subtarget.hasX87())
3314         report_fatal_error("X87 register return with X87 disabled");
3315       CopyVT = MVT::f80;
3316       RoundAfterCopy = (CopyVT != VA.getLocVT());
3317     }
3318 
3319     SDValue Val;
3320     if (VA.needsCustom()) {
3321       assert(VA.getValVT() == MVT::v64i1 &&
3322              "Currently the only custom case is when we split v64i1 to 2 regs");
3323       Val =
3324           getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3325     } else {
3326       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3327                   .getValue(1);
3328       Val = Chain.getValue(0);
3329       InFlag = Chain.getValue(2);
3330     }
3331 
3332     if (RoundAfterCopy)
3333       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3334                         // This truncation won't change the value.
3335                         DAG.getIntPtrConstant(1, dl));
3336 
3337     if (VA.isExtInLoc()) {
3338       if (VA.getValVT().isVector() &&
3339           VA.getValVT().getScalarType() == MVT::i1 &&
3340           ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3341            (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3342         // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3343         Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3344       } else
3345         Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3346     }
3347 
3348     if (VA.getLocInfo() == CCValAssign::BCvt)
3349       Val = DAG.getBitcast(VA.getValVT(), Val);
3350 
3351     InVals.push_back(Val);
3352   }
3353 
3354   return Chain;
3355 }
3356 
3357 //===----------------------------------------------------------------------===//
3358 //                C & StdCall & Fast Calling Convention implementation
3359 //===----------------------------------------------------------------------===//
//  The StdCall calling convention is standard for many Windows API routines.
//  It differs from the C calling convention only a little: the callee should
//  clean up the stack, not the caller, and symbols should also be decorated
//  in some fancy way :) It doesn't support any vector arguments.
//  For info on the fast calling convention see the Fast Calling Convention
//  (tail call) implementation in LowerX86_32FastCCCallTo.
3366 
3367 /// Determines whether Args, either a set of outgoing arguments to a call, or a
3368 /// set of incoming args of a call, contains an sret pointer that the callee
3369 /// pops
3370 template <typename T>
3371 static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
3372                              const X86Subtarget &Subtarget) {
3373   // Not C++20 (yet), so no concepts available.
3374   static_assert(std::is_same<T, ISD::OutputArg>::value ||
3375                     std::is_same<T, ISD::InputArg>::value,
3376                 "requires ISD::OutputArg or ISD::InputArg");
3377 
3378   // Only 32-bit pops the sret.  It's a 64-bit world these days, so early-out
3379   // for most compilations.
3380   if (!Subtarget.is32Bit())
3381     return false;
3382 
3383   if (Args.empty())
3384     return false;
3385 
  // Most calls do not have an sret argument; check the arg next.
3387   const ISD::ArgFlagsTy &Flags = Args[0].Flags;
3388   if (!Flags.isSRet() || Flags.isInReg())
3389     return false;
3390 
  // The MSVC ABI does not pop the sret.
3392   if (Subtarget.getTargetTriple().isOSMSVCRT())
3393     return false;
3394 
  // MCU targets don't pop the sret.
3396   if (Subtarget.isTargetMCU())
3397     return false;
3398 
  // Otherwise, the callee pops the sret argument.
3400   return true;
3401 }
3402 
3403 /// Make a copy of an aggregate at address specified by "Src" to address
3404 /// "Dst" with size and alignment information specified by the specific
3405 /// parameter attribute. The copy will be passed as a byval function parameter.
3406 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3407                                          SDValue Chain, ISD::ArgFlagsTy Flags,
3408                                          SelectionDAG &DAG, const SDLoc &dl) {
3409   SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3410 
3411   return DAG.getMemcpy(
3412       Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3413       /*isVolatile*/ false, /*AlwaysInline=*/true,
3414       /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3415 }
3416 
3417 /// Return true if the calling convention is one that we can guarantee TCO for.
3418 static bool canGuaranteeTCO(CallingConv::ID CC) {
3419   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3420           CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3421           CC == CallingConv::HHVM || CC == CallingConv::Tail ||
3422           CC == CallingConv::SwiftTail);
3423 }
3424 
3425 /// Return true if we might ever do TCO for calls with this calling convention.
3426 static bool mayTailCallThisCC(CallingConv::ID CC) {
3427   switch (CC) {
3428   // C calling conventions:
3429   case CallingConv::C:
3430   case CallingConv::Win64:
3431   case CallingConv::X86_64_SysV:
3432   // Callee pop conventions:
3433   case CallingConv::X86_ThisCall:
3434   case CallingConv::X86_StdCall:
3435   case CallingConv::X86_VectorCall:
3436   case CallingConv::X86_FastCall:
3437   // Swift:
3438   case CallingConv::Swift:
3439     return true;
3440   default:
3441     return canGuaranteeTCO(CC);
3442   }
3443 }
3444 
3445 /// Return true if the function is being made into a tailcall target by
3446 /// changing its ABI.
3447 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3448   return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3449          CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3450 }
3451 
3452 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3453   if (!CI->isTailCall())
3454     return false;
3455 
3456   CallingConv::ID CalleeCC = CI->getCallingConv();
3457   if (!mayTailCallThisCC(CalleeCC))
3458     return false;
3459 
3460   return true;
3461 }
3462 
3463 SDValue
3464 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3465                                     const SmallVectorImpl<ISD::InputArg> &Ins,
3466                                     const SDLoc &dl, SelectionDAG &DAG,
3467                                     const CCValAssign &VA,
3468                                     MachineFrameInfo &MFI, unsigned i) const {
3469   // Create the nodes corresponding to a load from this parameter slot.
3470   ISD::ArgFlagsTy Flags = Ins[i].Flags;
3471   bool AlwaysUseMutable = shouldGuaranteeTCO(
3472       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3473   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3474   EVT ValVT;
3475   MVT PtrVT = getPointerTy(DAG.getDataLayout());
3476 
  // If the value is passed by pointer, we have the address passed instead of
  // the value itself. No need to extend if the mask value and location share
  // the same absolute size.
3480   bool ExtendedInMem =
3481       VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3482       VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3483 
3484   if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3485     ValVT = VA.getLocVT();
3486   else
3487     ValVT = VA.getValVT();
3488 
3489   // FIXME: For now, all byval parameter objects are marked mutable. This can be
3490   // changed with more analysis.
  // In case of tail call optimization, mark all arguments mutable, since they
  // could be overwritten when lowering the arguments of a tail call.
3493   if (Flags.isByVal()) {
3494     unsigned Bytes = Flags.getByValSize();
3495     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3496 
3497     // FIXME: For now, all byval parameter objects are marked as aliasing. This
3498     // can be improved with deeper analysis.
3499     int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3500                                    /*isAliased=*/true);
3501     return DAG.getFrameIndex(FI, PtrVT);
3502   }
3503 
3504   EVT ArgVT = Ins[i].ArgVT;
3505 
3506   // If this is a vector that has been split into multiple parts, and the
  // scalar size of the parts doesn't match the vector element size, then we can't
3508   // elide the copy. The parts will have padding between them instead of being
3509   // packed like a vector.
3510   bool ScalarizedAndExtendedVector =
3511       ArgVT.isVector() && !VA.getLocVT().isVector() &&
3512       VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3513 
3514   // This is an argument in memory. We might be able to perform copy elision.
3515   // If the argument is passed directly in memory without any extension, then we
3516   // can perform copy elision. Large vector types, for example, may be passed
3517   // indirectly by pointer.
3518   if (Flags.isCopyElisionCandidate() &&
3519       VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3520       !ScalarizedAndExtendedVector) {
3521     SDValue PartAddr;
3522     if (Ins[i].PartOffset == 0) {
3523       // If this is a one-part value or the first part of a multi-part value,
3524       // create a stack object for the entire argument value type and return a
3525       // load from our portion of it. This assumes that if the first part of an
3526       // argument is in memory, the rest will also be in memory.
3527       int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3528                                      /*IsImmutable=*/false);
3529       PartAddr = DAG.getFrameIndex(FI, PtrVT);
3530       return DAG.getLoad(
3531           ValVT, dl, Chain, PartAddr,
3532           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3533     } else {
3534       // This is not the first piece of an argument in memory. See if there is
3535       // already a fixed stack object including this offset. If so, assume it
3536       // was created by the PartOffset == 0 branch above and create a load from
3537       // the appropriate offset into it.
3538       int64_t PartBegin = VA.getLocMemOffset();
3539       int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3540       int FI = MFI.getObjectIndexBegin();
3541       for (; MFI.isFixedObjectIndex(FI); ++FI) {
3542         int64_t ObjBegin = MFI.getObjectOffset(FI);
3543         int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3544         if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3545           break;
3546       }
3547       if (MFI.isFixedObjectIndex(FI)) {
3548         SDValue Addr =
3549             DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3550                         DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3551         return DAG.getLoad(
3552             ValVT, dl, Chain, Addr,
3553             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3554                                               Ins[i].PartOffset));
3555       }
3556     }
3557   }
3558 
3559   int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3560                                  VA.getLocMemOffset(), isImmutable);
3561 
3562   // Set SExt or ZExt flag.
3563   if (VA.getLocInfo() == CCValAssign::ZExt) {
3564     MFI.setObjectZExt(FI, true);
3565   } else if (VA.getLocInfo() == CCValAssign::SExt) {
3566     MFI.setObjectSExt(FI, true);
3567   }
3568 
3569   SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3570   SDValue Val = DAG.getLoad(
3571       ValVT, dl, Chain, FIN,
3572       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3573   return ExtendedInMem
3574              ? (VA.getValVT().isVector()
3575                     ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3576                     : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3577              : Val;
3578 }
3579 
3580 // FIXME: Get this from tablegen.
3581 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3582                                                 const X86Subtarget &Subtarget) {
3583   assert(Subtarget.is64Bit());
3584 
3585   if (Subtarget.isCallingConvWin64(CallConv)) {
3586     static const MCPhysReg GPR64ArgRegsWin64[] = {
3587       X86::RCX, X86::RDX, X86::R8,  X86::R9
3588     };
3589     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3590   }
3591 
3592   static const MCPhysReg GPR64ArgRegs64Bit[] = {
3593     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3594   };
3595   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3596 }
3597 
3598 // FIXME: Get this from tablegen.
3599 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3600                                                 CallingConv::ID CallConv,
3601                                                 const X86Subtarget &Subtarget) {
3602   assert(Subtarget.is64Bit());
3603   if (Subtarget.isCallingConvWin64(CallConv)) {
3604     // The XMM registers which might contain var arg parameters are shadowed
    // in their paired GPR.  So we only need to save the GPRs to their home
3606     // slots.
3607     // TODO: __vectorcall will change this.
3608     return None;
3609   }
3610 
3611   bool isSoftFloat = Subtarget.useSoftFloat();
3612   if (isSoftFloat || !Subtarget.hasSSE1())
3613     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3614     // registers.
3615     return None;
3616 
3617   static const MCPhysReg XMMArgRegs64Bit[] = {
3618     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3619     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3620   };
3621   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3622 }
3623 
3624 #ifndef NDEBUG
3625 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3626   return llvm::is_sorted(
3627       ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3628         return A.getValNo() < B.getValNo();
3629       });
3630 }
3631 #endif
3632 
3633 namespace {
/// This is a helper class for lowering variable argument (varargs) parameters.
3635 class VarArgsLoweringHelper {
3636 public:
3637   VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3638                         SelectionDAG &DAG, const X86Subtarget &Subtarget,
3639                         CallingConv::ID CallConv, CCState &CCInfo)
3640       : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3641         TheMachineFunction(DAG.getMachineFunction()),
3642         TheFunction(TheMachineFunction.getFunction()),
3643         FrameInfo(TheMachineFunction.getFrameInfo()),
3644         FrameLowering(*Subtarget.getFrameLowering()),
3645         TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3646         CCInfo(CCInfo) {}
3647 
  // Lower variable argument (varargs) parameters.
3649   void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3650 
3651 private:
3652   void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3653 
3654   void forwardMustTailParameters(SDValue &Chain);
3655 
3656   bool is64Bit() const { return Subtarget.is64Bit(); }
3657   bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3658 
3659   X86MachineFunctionInfo *FuncInfo;
3660   const SDLoc &DL;
3661   SelectionDAG &DAG;
3662   const X86Subtarget &Subtarget;
3663   MachineFunction &TheMachineFunction;
3664   const Function &TheFunction;
3665   MachineFrameInfo &FrameInfo;
3666   const TargetFrameLowering &FrameLowering;
3667   const TargetLowering &TargLowering;
3668   CallingConv::ID CallConv;
3669   CCState &CCInfo;
3670 };
3671 } // namespace
3672 
3673 void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3674     SDValue &Chain, unsigned StackSize) {
  // If the function takes a variable number of arguments, make a frame index
  // for the start of the first vararg value... for expansion of llvm.va_start.
  // We can skip this if there are no va_start calls.
3678   if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3679                     CallConv != CallingConv::X86_ThisCall)) {
3680     FuncInfo->setVarArgsFrameIndex(
3681         FrameInfo.CreateFixedObject(1, StackSize, true));
3682   }
3683 
3684   // 64-bit calling conventions support varargs and register parameters, so we
3685   // have to do extra work to spill them in the prologue.
3686   if (is64Bit()) {
    // Find the index of the first unallocated GPR and XMM argument register.
3688     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3689     ArrayRef<MCPhysReg> ArgXMMs =
3690         get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3691     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3692     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3693 
3694     assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3695            "SSE register cannot be used when SSE is disabled!");
3696 
3697     if (isWin64()) {
3698       // Get to the caller-allocated home save location.  Add 8 to account
3699       // for the return address.
3700       int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
3701       FuncInfo->setRegSaveFrameIndex(
3702           FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
      // Fix up the vararg frame index to point at the shadow area (4 x i64).
3704       if (NumIntRegs < 4)
3705         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3706     } else {
3707       // For X86-64, if there are vararg parameters that are passed via
3708       // registers, then we must store them to their spots on the stack so
3709       // they may be loaded by dereferencing the result of va_next.
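      // The register save area is laid out as all GPR home slots followed by
      // all XMM slots: with the SysV register lists above that is 6 * 8 = 48
      // bytes of GPRs followed by 8 * 16 = 128 bytes of XMMs, 176 bytes total.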
3710       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3711       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3712       FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
3713           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
3714     }
3715 
    // SDValues for the GPR argument registers holding live incoming values.
    SmallVector<SDValue, 6> LiveGPRs;
    // SDValues for the XMM argument registers holding live incoming values.
    SmallVector<SDValue, 8> LiveXMMRegs;
    // If applicable, holds the SDValue for the live-in %al register.
    SDValue ALVal;
3721 
3722     // Gather all the live in physical registers.
3723     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3724       Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
3725       LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
3726     }
3727     const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
3728     if (!AvailableXmms.empty()) {
3729       Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3730       ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
3731       for (MCPhysReg Reg : AvailableXmms) {
        // The fast register allocator spills virtual registers at basic block
        // boundaries, which would lead to uses of XMM registers outside the
        // check for %al. Pass physical registers to VASTART_SAVE_XMM_REGS to
        // avoid unnecessary spilling.
3736         TheMachineFunction.getRegInfo().addLiveIn(Reg);
3737         LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
3738       }
3739     }
3740 
3741     // Store the integer parameter registers.
3742     SmallVector<SDValue, 8> MemOps;
3743     SDValue RSFIN =
3744         DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3745                           TargLowering.getPointerTy(DAG.getDataLayout()));
3746     unsigned Offset = FuncInfo->getVarArgsGPOffset();
3747     for (SDValue Val : LiveGPRs) {
3748       SDValue FIN = DAG.getNode(ISD::ADD, DL,
3749                                 TargLowering.getPointerTy(DAG.getDataLayout()),
3750                                 RSFIN, DAG.getIntPtrConstant(Offset, DL));
3751       SDValue Store =
3752           DAG.getStore(Val.getValue(1), DL, Val, FIN,
3753                        MachinePointerInfo::getFixedStack(
3754                            DAG.getMachineFunction(),
3755                            FuncInfo->getRegSaveFrameIndex(), Offset));
3756       MemOps.push_back(Store);
3757       Offset += 8;
3758     }
3759 
3760     // Now store the XMM (fp + vector) parameter registers.
3761     if (!LiveXMMRegs.empty()) {
3762       SmallVector<SDValue, 12> SaveXMMOps;
3763       SaveXMMOps.push_back(Chain);
3764       SaveXMMOps.push_back(ALVal);
3765       SaveXMMOps.push_back(RSFIN);
3766       SaveXMMOps.push_back(
3767           DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
3768       llvm::append_range(SaveXMMOps, LiveXMMRegs);
3769       MachineMemOperand *StoreMMO =
3770           DAG.getMachineFunction().getMachineMemOperand(
3771               MachinePointerInfo::getFixedStack(
3772                   DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
3773                   Offset),
3774               MachineMemOperand::MOStore, 128, Align(16));
3775       MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
3776                                                DL, DAG.getVTList(MVT::Other),
3777                                                SaveXMMOps, MVT::i8, StoreMMO));
3778     }
3779 
3780     if (!MemOps.empty())
3781       Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3782   }
3783 }
3784 
3785 void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
3786   // Find the largest legal vector type.
3787   MVT VecVT = MVT::Other;
3788   // FIXME: Only some x86_32 calling conventions support AVX512.
3789   if (Subtarget.useAVX512Regs() &&
3790       (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
3791                      CallConv == CallingConv::Intel_OCL_BI)))
3792     VecVT = MVT::v16f32;
3793   else if (Subtarget.hasAVX())
3794     VecVT = MVT::v8f32;
3795   else if (Subtarget.hasSSE2())
3796     VecVT = MVT::v4f32;
3797 
3798   // We forward some GPRs and some vector types.
3799   SmallVector<MVT, 2> RegParmTypes;
3800   MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
3801   RegParmTypes.push_back(IntVT);
3802   if (VecVT != MVT::Other)
3803     RegParmTypes.push_back(VecVT);
3804 
3805   // Compute the set of forwarded registers. The rest are scratch.
3806   SmallVectorImpl<ForwardedRegister> &Forwards =
3807       FuncInfo->getForwardedMustTailRegParms();
3808   CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3809 
3810   // Forward AL for SysV x86_64 targets, since it is used for varargs.
3811   if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
3812     Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3813     Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3814   }
3815 
3816   // Copy all forwards from physical to virtual registers.
3817   for (ForwardedRegister &FR : Forwards) {
3818     // FIXME: Can we use a less constrained schedule?
3819     SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
3820     FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
3821         TargLowering.getRegClassFor(FR.VT));
3822     Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
3823   }
3824 }
3825 
3826 void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
3827                                                    unsigned StackSize) {
  // Set the frame indices to the sentinel value 0xAAAAAAA to mark them as
  // unset. If necessary, they will be set to the correct values later.
3830   FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3831   FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3832 
3833   if (FrameInfo.hasVAStart())
3834     createVarArgAreaAndStoreRegisters(Chain, StackSize);
3835 
3836   if (FrameInfo.hasMustTailInVarArgFunc())
3837     forwardMustTailParameters(Chain);
3838 }
3839 
3840 SDValue X86TargetLowering::LowerFormalArguments(
3841     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
3842     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3843     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3844   MachineFunction &MF = DAG.getMachineFunction();
3845   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3846 
3847   const Function &F = MF.getFunction();
3848   if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3849       F.getName() == "main")
3850     FuncInfo->setForceFramePointer(true);
3851 
3852   MachineFrameInfo &MFI = MF.getFrameInfo();
3853   bool Is64Bit = Subtarget.is64Bit();
3854   bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3855 
3856   assert(
3857       !(IsVarArg && canGuaranteeTCO(CallConv)) &&
3858       "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3859 
3860   // Assign locations to all of the incoming arguments.
3861   SmallVector<CCValAssign, 16> ArgLocs;
3862   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3863 
3864   // Allocate shadow area for Win64.
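  // The Win64 ABI reserves a 32-byte home area (four 8-byte slots shadowing
  // RCX, RDX, R8 and R9) just above the return address, where the callee may
  // spill its register arguments.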
3865   if (IsWin64)
3866     CCInfo.AllocateStack(32, Align(8));
3867 
3868   CCInfo.AnalyzeArguments(Ins, CC_X86);
3869 
3870   // In vectorcall calling convention a second pass is required for the HVA
3871   // types.
3872   if (CallingConv::X86_VectorCall == CallConv) {
3873     CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3874   }
3875 
  // The next loop assumes that the locations are in the same order as the
  // input arguments.
3878   assert(isSortedByValueNo(ArgLocs) &&
3879          "Argument Location list must be sorted before lowering");
3880 
3881   SDValue ArgValue;
3882   for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3883        ++I, ++InsIndex) {
3884     assert(InsIndex < Ins.size() && "Invalid Ins index");
3885     CCValAssign &VA = ArgLocs[I];
3886 
3887     if (VA.isRegLoc()) {
3888       EVT RegVT = VA.getLocVT();
3889       if (VA.needsCustom()) {
3890         assert(
3891             VA.getValVT() == MVT::v64i1 &&
3892             "Currently the only custom case is when we split v64i1 to 2 regs");
3893 
        // In the regcall calling convention, v64i1 values compiled for a
        // 32-bit arch are split up into two registers.
3896         ArgValue =
3897             getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3898       } else {
3899         const TargetRegisterClass *RC;
3900         if (RegVT == MVT::i8)
3901           RC = &X86::GR8RegClass;
3902         else if (RegVT == MVT::i16)
3903           RC = &X86::GR16RegClass;
3904         else if (RegVT == MVT::i32)
3905           RC = &X86::GR32RegClass;
3906         else if (Is64Bit && RegVT == MVT::i64)
3907           RC = &X86::GR64RegClass;
3908         else if (RegVT == MVT::f16)
3909           RC = &X86::FR16XRegClass;
3910         else if (RegVT == MVT::f32)
3911           RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3912         else if (RegVT == MVT::f64)
3913           RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3914         else if (RegVT == MVT::f80)
3915           RC = &X86::RFP80RegClass;
3916         else if (RegVT == MVT::f128)
3917           RC = &X86::VR128RegClass;
3918         else if (RegVT.is512BitVector())
3919           RC = &X86::VR512RegClass;
3920         else if (RegVT.is256BitVector())
3921           RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3922         else if (RegVT.is128BitVector())
3923           RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3924         else if (RegVT == MVT::x86mmx)
3925           RC = &X86::VR64RegClass;
3926         else if (RegVT == MVT::v1i1)
3927           RC = &X86::VK1RegClass;
3928         else if (RegVT == MVT::v8i1)
3929           RC = &X86::VK8RegClass;
3930         else if (RegVT == MVT::v16i1)
3931           RC = &X86::VK16RegClass;
3932         else if (RegVT == MVT::v32i1)
3933           RC = &X86::VK32RegClass;
3934         else if (RegVT == MVT::v64i1)
3935           RC = &X86::VK64RegClass;
3936         else
3937           llvm_unreachable("Unknown argument type!");
3938 
3939         Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3940         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3941       }
3942 
3943       // If this is an 8 or 16-bit value, it is really passed promoted to 32
3944       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
3945       // right size.
3946       if (VA.getLocInfo() == CCValAssign::SExt)
3947         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3948                                DAG.getValueType(VA.getValVT()));
3949       else if (VA.getLocInfo() == CCValAssign::ZExt)
3950         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3951                                DAG.getValueType(VA.getValVT()));
3952       else if (VA.getLocInfo() == CCValAssign::BCvt)
3953         ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3954 
3955       if (VA.isExtInLoc()) {
3956         // Handle MMX values passed in XMM regs.
3957         if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3958           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3959         else if (VA.getValVT().isVector() &&
3960                  VA.getValVT().getScalarType() == MVT::i1 &&
3961                  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3962                   (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3963           // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3964           ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3965         } else
3966           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3967       }
3968     } else {
3969       assert(VA.isMemLoc());
3970       ArgValue =
3971           LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3972     }
3973 
3974     // If value is passed via pointer - do a load.
3975     if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3976       ArgValue =
3977           DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3978 
3979     InVals.push_back(ArgValue);
3980   }
3981 
3982   for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3983     if (Ins[I].Flags.isSwiftAsync()) {
3984       auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
3985       if (Subtarget.is64Bit())
3986         X86FI->setHasSwiftAsyncContext(true);
3987       else {
3988         int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
3989         X86FI->setSwiftAsyncContextFrameIdx(FI);
3990         SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
3991                                   DAG.getFrameIndex(FI, MVT::i32),
3992                                   MachinePointerInfo::getFixedStack(MF, FI));
3993         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
3994       }
3995     }
3996 
    // The Swift calling convention does not require that we copy the sret
    // argument into %rax/%eax for the return. We don't set SRetReturnReg for
    // Swift.
3999     if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
4000       continue;
4001 
4002     // All x86 ABIs require that for returning structs by value we copy the
4003     // sret argument into %rax/%eax (depending on ABI) for the return. Save
4004     // the argument into a virtual register so that we can access it from the
4005     // return points.
4006     if (Ins[I].Flags.isSRet()) {
4007       assert(!FuncInfo->getSRetReturnReg() &&
4008              "SRet return has already been set");
4009       MVT PtrTy = getPointerTy(DAG.getDataLayout());
4010       Register Reg =
4011           MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
4012       FuncInfo->setSRetReturnReg(Reg);
4013       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
4014       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
4015       break;
4016     }
4017   }
4018 
4019   unsigned StackSize = CCInfo.getNextStackOffset();
4020   // Align stack specially for tail calls.
4021   if (shouldGuaranteeTCO(CallConv,
4022                          MF.getTarget().Options.GuaranteedTailCallOpt))
4023     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
4024 
4025   if (IsVarArg)
4026     VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
4027         .lowerVarArgsParameters(Chain, StackSize);
4028 
4029   // Some CCs need callee pop.
4030   if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
4031                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
4032     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
4033   } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
4034     // X86 interrupts must pop the error code (and the alignment padding) if
4035     // present.
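    // (Two incoming values mean the pointer to the interrupt frame plus an
    // error code; the 64-bit handler pops the 8-byte error code and 8 bytes of
    // alignment padding.)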
4036     FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
4037   } else {
4038     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
4039     // If this is an sret function, the return should pop the hidden pointer.
4040     if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
4041       FuncInfo->setBytesToPopOnReturn(4);
4042   }
4043 
4044   if (!Is64Bit) {
4045     // RegSaveFrameIndex is X86-64 only.
4046     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4047   }
4048 
4049   FuncInfo->setArgumentStackSize(StackSize);
4050 
4051   if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4052     EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4053     if (Personality == EHPersonality::CoreCLR) {
4054       assert(Is64Bit);
4055       // TODO: Add a mechanism to frame lowering that will allow us to indicate
4056       // that we'd prefer this slot be allocated towards the bottom of the frame
4057       // (i.e. near the stack pointer after allocating the frame).  Every
4058       // funclet needs a copy of this slot in its (mostly empty) frame, and the
4059       // offset from the bottom of this and each funclet's frame must be the
4060       // same, so the size of funclets' (mostly empty) frames is dictated by
4061       // how far this slot is from the bottom (since they allocate just enough
4062       // space to accommodate holding this slot at the correct offset).
4063       int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4064       EHInfo->PSPSymFrameIdx = PSPSymFI;
4065     }
4066   }
4067 
4068   if (CallConv == CallingConv::X86_RegCall ||
4069       F.hasFnAttribute("no_caller_saved_registers")) {
4070     MachineRegisterInfo &MRI = MF.getRegInfo();
4071     for (std::pair<Register, Register> Pair : MRI.liveins())
4072       MRI.disableCalleeSavedRegister(Pair.first);
4073   }
4074 
4075   return Chain;
4076 }
4077 
4078 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4079                                             SDValue Arg, const SDLoc &dl,
4080                                             SelectionDAG &DAG,
4081                                             const CCValAssign &VA,
4082                                             ISD::ArgFlagsTy Flags,
4083                                             bool isByVal) const {
4084   unsigned LocMemOffset = VA.getLocMemOffset();
4085   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4086   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4087                        StackPtr, PtrOff);
4088   if (isByVal)
4089     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4090 
4091   return DAG.getStore(
4092       Chain, dl, Arg, PtrOff,
4093       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
4094 }
4095 
/// Emit a load of the return address if tail call
/// optimization is performed and it is required.
4098 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4099     SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4100     bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4101   // Adjust the Return address stack slot.
4102   EVT VT = getPointerTy(DAG.getDataLayout());
4103   OutRetAddr = getReturnAddressFrameIndex(DAG);
4104 
4105   // Load the "old" Return address.
4106   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4107   return SDValue(OutRetAddr.getNode(), 1);
4108 }
4109 
/// Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff != 0).
4112 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4113                                         SDValue Chain, SDValue RetAddrFrIdx,
4114                                         EVT PtrVT, unsigned SlotSize,
4115                                         int FPDiff, const SDLoc &dl) {
4116   // Store the return address to the appropriate stack slot.
4117   if (!FPDiff) return Chain;
4118   // Calculate the new stack slot for the return address.
4119   int NewReturnAddrFI =
4120     MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4121                                          false);
4122   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4123   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4124                        MachinePointerInfo::getFixedStack(
4125                            DAG.getMachineFunction(), NewReturnAddrFI));
4126   return Chain;
4127 }
4128 
/// Returns a vector_shuffle node for a movs{s|d} or movd
/// operation of the specified width.
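/// For example, for a 4-element vector the shuffle mask is <4, 1, 2, 3>:
/// element 0 is taken from V2 and the remaining elements from V1, matching
/// the movss/movsd behavior of replacing only the low element.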
4131 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4132                        SDValue V2) {
4133   unsigned NumElems = VT.getVectorNumElements();
4134   SmallVector<int, 8> Mask;
4135   Mask.push_back(NumElems);
4136   for (unsigned i = 1; i != NumElems; ++i)
4137     Mask.push_back(i);
4138   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4139 }
4140 
4141 SDValue
4142 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4143                              SmallVectorImpl<SDValue> &InVals) const {
4144   SelectionDAG &DAG                     = CLI.DAG;
4145   SDLoc &dl                             = CLI.DL;
4146   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4147   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
4148   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
4149   SDValue Chain                         = CLI.Chain;
4150   SDValue Callee                        = CLI.Callee;
4151   CallingConv::ID CallConv              = CLI.CallConv;
4152   bool &isTailCall                      = CLI.IsTailCall;
4153   bool isVarArg                         = CLI.IsVarArg;
4154   const auto *CB                        = CLI.CB;
4155 
4156   MachineFunction &MF = DAG.getMachineFunction();
4157   bool Is64Bit        = Subtarget.is64Bit();
4158   bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
4159   bool IsSibcall      = false;
4160   bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4161       CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4162   bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
4163   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
4164   bool HasNCSR = (CB && isa<CallInst>(CB) &&
4165                   CB->hasFnAttr("no_caller_saved_registers"));
4166   bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4167   bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4168   const Module *M = MF.getMMI().getModule();
4169   Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4170 
4171   MachineFunction::CallSiteInfo CSInfo;
4172   if (CallConv == CallingConv::X86_INTR)
4173     report_fatal_error("X86 interrupts may not be called directly");
4174 
4175   bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4176   if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4177     // If we are using a GOT, disable tail calls to external symbols with
4178     // default visibility. Tail calling such a symbol requires using a GOT
    // relocation, which forces early binding of the symbol. This breaks code
    // that requires lazy function symbol resolution. Using musttail or
4181     // GuaranteedTailCallOpt will override this.
4182     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4183     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4184                G->getGlobal()->hasDefaultVisibility()))
4185       isTailCall = false;
4186   }
4187 
4188   if (isTailCall && !IsMustTail) {
4189     // Check if it's really possible to do a tail call.
4190     isTailCall = IsEligibleForTailCallOptimization(
4191         Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
4192         Ins, DAG);
4193 
4194     // Sibcalls are automatically detected tailcalls which do not require
4195     // ABI changes.
4196     if (!IsGuaranteeTCO && isTailCall)
4197       IsSibcall = true;
4198 
4199     if (isTailCall)
4200       ++NumTailCalls;
4201   }
4202 
4203   if (IsMustTail && !isTailCall)
4204     report_fatal_error("failed to perform tail call elimination on a call "
4205                        "site marked musttail");
4206 
4207   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4208          "Var args not supported with calling convention fastcc, ghc or hipe");
4209 
4210   // Analyze operands of the call, assigning locations to each operand.
4211   SmallVector<CCValAssign, 16> ArgLocs;
4212   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4213 
4214   // Allocate shadow area for Win64.
4215   if (IsWin64)
4216     CCInfo.AllocateStack(32, Align(8));
4217 
4218   CCInfo.AnalyzeArguments(Outs, CC_X86);
4219 
4220   // In vectorcall calling convention a second pass is required for the HVA
4221   // types.
4222   if (CallingConv::X86_VectorCall == CallConv) {
4223     CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4224   }
4225 
4226   // Get a count of how many bytes are to be pushed on the stack.
4227   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4228   if (IsSibcall)
    // This is a sibcall. The memory operands are already available in the
    // caller's incoming argument area, i.e. in our own caller's stack.
4231     NumBytes = 0;
4232   else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4233     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4234 
4235   int FPDiff = 0;
4236   if (isTailCall &&
4237       shouldGuaranteeTCO(CallConv,
4238                          MF.getTarget().Options.GuaranteedTailCallOpt)) {
4239     // Lower arguments at fp - stackoffset + fpdiff.
4240     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4241 
4242     FPDiff = NumBytesCallerPushed - NumBytes;
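    // For example, if the caller pops 8 bytes of arguments on return but this
    // call needs 24 bytes, FPDiff is -16 and the return address slot is moved
    // down by 16 bytes.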
4243 
    // Set the delta of movement of the return address stack slot, but only if
    // this delta is greater (more negative) than the previous delta.
4246     if (FPDiff < X86Info->getTCReturnAddrDelta())
4247       X86Info->setTCReturnAddrDelta(FPDiff);
4248   }
4249 
4250   unsigned NumBytesToPush = NumBytes;
4251   unsigned NumBytesToPop = NumBytes;
4252 
4253   // If we have an inalloca argument, all stack space has already been allocated
  // for us and is right at the top of the stack.  We don't support multiple
4255   // arguments passed in memory when using inalloca.
4256   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4257     NumBytesToPush = 0;
4258     if (!ArgLocs.back().isMemLoc())
4259       report_fatal_error("cannot use inalloca attribute on a register "
4260                          "parameter");
4261     if (ArgLocs.back().getLocMemOffset() != 0)
4262       report_fatal_error("any parameter with the inalloca attribute must be "
4263                          "the only memory argument");
4264   } else if (CLI.IsPreallocated) {
4265     assert(ArgLocs.back().isMemLoc() &&
4266            "cannot use preallocated attribute on a register "
4267            "parameter");
4268     SmallVector<size_t, 4> PreallocatedOffsets;
4269     for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4270       if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4271         PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4272       }
4273     }
4274     auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4275     size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4276     MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4277     MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4278     NumBytesToPush = 0;
4279   }
4280 
4281   if (!IsSibcall && !IsMustTail)
4282     Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4283                                  NumBytes - NumBytesToPush, dl);
4284 
4285   SDValue RetAddrFrIdx;
4286   // Load return address for tail calls.
4287   if (isTailCall && FPDiff)
4288     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4289                                     Is64Bit, FPDiff, dl);
4290 
4291   SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4292   SmallVector<SDValue, 8> MemOpChains;
4293   SDValue StackPtr;
4294 
  // The next loop assumes that the locations are in the same order as the
  // input arguments.
4297   assert(isSortedByValueNo(ArgLocs) &&
4298          "Argument Location list must be sorted before lowering");
4299 
  // Walk the register/memloc assignments, inserting copies/loads.  In the case
  // of tail call optimization, arguments are handled later.
4302   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4303   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4304        ++I, ++OutIndex) {
4305     assert(OutIndex < Outs.size() && "Invalid Out index");
4306     // Skip inalloca/preallocated arguments, they have already been written.
4307     ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4308     if (Flags.isInAlloca() || Flags.isPreallocated())
4309       continue;
4310 
4311     CCValAssign &VA = ArgLocs[I];
4312     EVT RegVT = VA.getLocVT();
4313     SDValue Arg = OutVals[OutIndex];
4314     bool isByVal = Flags.isByVal();
4315 
4316     // Promote the value if needed.
4317     switch (VA.getLocInfo()) {
4318     default: llvm_unreachable("Unknown loc info!");
4319     case CCValAssign::Full: break;
4320     case CCValAssign::SExt:
4321       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4322       break;
4323     case CCValAssign::ZExt:
4324       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4325       break;
4326     case CCValAssign::AExt:
4327       if (Arg.getValueType().isVector() &&
4328           Arg.getValueType().getVectorElementType() == MVT::i1)
4329         Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4330       else if (RegVT.is128BitVector()) {
4331         // Special case: passing MMX values in XMM registers.
4332         Arg = DAG.getBitcast(MVT::i64, Arg);
4333         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4334         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4335       } else
4336         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4337       break;
4338     case CCValAssign::BCvt:
4339       Arg = DAG.getBitcast(RegVT, Arg);
4340       break;
4341     case CCValAssign::Indirect: {
4342       if (isByVal) {
4343         // Memcpy the argument to a temporary stack slot to prevent
4344         // the caller from seeing any modifications the callee may make
4345         // as guaranteed by the `byval` attribute.
4346         int FrameIdx = MF.getFrameInfo().CreateStackObject(
4347             Flags.getByValSize(),
4348             std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4349         SDValue StackSlot =
4350             DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4351         Chain =
4352             CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4353         // From now on treat this as a regular pointer
4354         Arg = StackSlot;
4355         isByVal = false;
4356       } else {
4357         // Store the argument.
4358         SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4359         int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4360         Chain = DAG.getStore(
4361             Chain, dl, Arg, SpillSlot,
4362             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4363         Arg = SpillSlot;
4364       }
4365       break;
4366     }
4367     }
4368 
4369     if (VA.needsCustom()) {
4370       assert(VA.getValVT() == MVT::v64i1 &&
4371              "Currently the only custom case is when we split v64i1 to 2 regs");
4372       // Split v64i1 value into two registers
4373       Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4374     } else if (VA.isRegLoc()) {
4375       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4376       const TargetOptions &Options = DAG.getTarget().Options;
4377       if (Options.EmitCallSiteInfo)
4378         CSInfo.emplace_back(VA.getLocReg(), I);
4379       if (isVarArg && IsWin64) {
        // The Win64 ABI requires an argument passed in an XMM register to also
        // be copied to the corresponding shadow GPR if the callee is a varargs
        // function.
4382         Register ShadowReg;
4383         switch (VA.getLocReg()) {
4384         case X86::XMM0: ShadowReg = X86::RCX; break;
4385         case X86::XMM1: ShadowReg = X86::RDX; break;
4386         case X86::XMM2: ShadowReg = X86::R8; break;
4387         case X86::XMM3: ShadowReg = X86::R9; break;
4388         }
4389         if (ShadowReg)
4390           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4391       }
4392     } else if (!IsSibcall && (!isTailCall || isByVal)) {
4393       assert(VA.isMemLoc());
4394       if (!StackPtr.getNode())
4395         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4396                                       getPointerTy(DAG.getDataLayout()));
4397       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4398                                              dl, DAG, VA, Flags, isByVal));
4399     }
4400   }
4401 
4402   if (!MemOpChains.empty())
4403     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4404 
4405   if (Subtarget.isPICStyleGOT()) {
    // ELF / PIC requires the GOT to be in the EBX register before function
    // calls via the PLT GOT pointer (except for regcall).
4408     if (!isTailCall) {
      // An indirect call with the RegCall calling convention may use up all
      // the general-purpose registers, so it is not suitable to bind the EBX
      // register for the GOT address; just let the register allocator handle
      // it.
4412       if (CallConv != CallingConv::X86_RegCall)
4413         RegsToPass.push_back(std::make_pair(
4414           Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4415                                           getPointerTy(DAG.getDataLayout()))));
4416     } else {
      // If we are tail calling and generating PIC/GOT style code, load the
      // address of the callee into ECX. The value in ECX is used as target of
4419       // the tail jump. This is done to circumvent the ebx/callee-saved problem
4420       // for tail calls on PIC/GOT architectures. Normally we would just put the
4421       // address of GOT into ebx and then call target@PLT. But for tail calls
4422       // ebx would be restored (since ebx is callee saved) before jumping to the
4423       // target@PLT.
4424 
4425       // Note: The actual moving to ECX is done further down.
4426       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4427       if (G && !G->getGlobal()->hasLocalLinkage() &&
4428           G->getGlobal()->hasDefaultVisibility())
4429         Callee = LowerGlobalAddress(Callee, DAG);
4430       else if (isa<ExternalSymbolSDNode>(Callee))
4431         Callee = LowerExternalSymbol(Callee, DAG);
4432     }
4433   }
4434 
4435   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
4436       (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
4437     // From AMD64 ABI document:
4438     // For calls that may call functions that use varargs or stdargs
4439     // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as a hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and must be in the range 0 - 8 inclusive.
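    // For example, a call like printf("%f", 1.0) passes one value in an XMM
    // register, so %al is set to 1 below (any upper bound up to 8 would also
    // be ABI-conformant).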
4444 
4445     // Count the number of XMM registers allocated.
4446     static const MCPhysReg XMMArgRegs[] = {
4447       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4448       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4449     };
4450     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4451     assert((Subtarget.hasSSE1() || !NumXMMRegs)
4452            && "SSE registers cannot be used when SSE is disabled");
4453     RegsToPass.push_back(std::make_pair(Register(X86::AL),
4454                                         DAG.getConstant(NumXMMRegs, dl,
4455                                                         MVT::i8)));
4456   }
4457 
4458   if (isVarArg && IsMustTail) {
4459     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4460     for (const auto &F : Forwards) {
4461       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4462       RegsToPass.push_back(std::make_pair(F.PReg, Val));
4463     }
4464   }
4465 
4466   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
4467   // don't need this because the eligibility check rejects calls that require
4468   // shuffling arguments passed in memory.
4469   if (!IsSibcall && isTailCall) {
4470     // Force all the incoming stack arguments to be loaded from the stack
4471     // before any new outgoing arguments are stored to the stack, because the
4472     // outgoing stack slots may alias the incoming argument stack slots, and
4473     // the alias isn't otherwise explicit. This is slightly more conservative
4474     // than necessary, because it means that each store effectively depends
4475     // on every argument instead of just those arguments it would clobber.
4476     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4477 
4478     SmallVector<SDValue, 8> MemOpChains2;
4479     SDValue FIN;
4480     int FI = 0;
4481     for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4482          ++I, ++OutsIndex) {
4483       CCValAssign &VA = ArgLocs[I];
4484 
4485       if (VA.isRegLoc()) {
4486         if (VA.needsCustom()) {
4487           assert((CallConv == CallingConv::X86_RegCall) &&
4488                  "Expecting custom case only in regcall calling convention");
          // This means that we are in a special case where one argument was
          // passed through two register locations; skip the next location.
4491           ++I;
4492         }
4493 
4494         continue;
4495       }
4496 
4497       assert(VA.isMemLoc());
4498       SDValue Arg = OutVals[OutsIndex];
4499       ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4500       // Skip inalloca/preallocated arguments.  They don't require any work.
4501       if (Flags.isInAlloca() || Flags.isPreallocated())
4502         continue;
4503       // Create frame index.
4504       int32_t Offset = VA.getLocMemOffset()+FPDiff;
4505       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4506       FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4507       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4508 
4509       if (Flags.isByVal()) {
4510         // Copy relative to framepointer.
4511         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4512         if (!StackPtr.getNode())
4513           StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4514                                         getPointerTy(DAG.getDataLayout()));
4515         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4516                              StackPtr, Source);
4517 
4518         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4519                                                          ArgChain,
4520                                                          Flags, DAG, dl));
4521       } else {
4522         // Store relative to framepointer.
4523         MemOpChains2.push_back(DAG.getStore(
4524             ArgChain, dl, Arg, FIN,
4525             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4526       }
4527     }
4528 
4529     if (!MemOpChains2.empty())
4530       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4531 
4532     // Store the return address to the appropriate stack slot.
4533     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4534                                      getPointerTy(DAG.getDataLayout()),
4535                                      RegInfo->getSlotSize(), FPDiff, dl);
4536   }
4537 
4538   // Build a sequence of copy-to-reg nodes chained together with token chain
4539   // and flag operands which copy the outgoing args into registers.
4540   SDValue InFlag;
4541   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4542     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4543                              RegsToPass[i].second, InFlag);
4544     InFlag = Chain.getValue(1);
4545   }
4546 
4547   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4548     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4549     // In the 64-bit large code model, we have to make all calls
4550     // through a register, since the call instruction's 32-bit
4551     // pc-relative offset may not be large enough to hold the whole
4552     // address.
4553   } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4554              Callee->getOpcode() == ISD::ExternalSymbol) {
4555     // Lower direct calls to global addresses and external symbols. Setting
4556     // ForCall to true here has the effect of removing WrapperRIP when possible
4557     // to allow direct calls to be selected without first materializing the
4558     // address into a register.
4559     Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4560   } else if (Subtarget.isTarget64BitILP32() &&
4561              Callee.getValueType() == MVT::i32) {
    // Zero-extend the 32-bit Callee address to 64 bits per the x32 ABI.
4563     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4564   }
4565 
4566   // Returns a chain & a flag for retval copy to use.
4567   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4568   SmallVector<SDValue, 8> Ops;
4569 
4570   if (!IsSibcall && isTailCall && !IsMustTail) {
4571     Chain = DAG.getCALLSEQ_END(Chain,
4572                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4573                                DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4574     InFlag = Chain.getValue(1);
4575   }
4576 
4577   Ops.push_back(Chain);
4578   Ops.push_back(Callee);
4579 
4580   if (isTailCall)
4581     Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4582 
4583   // Add argument registers to the end of the list so that they are known live
4584   // into the call.
4585   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4586     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4587                                   RegsToPass[i].second.getValueType()));
4588 
4589   // Add a register mask operand representing the call-preserved registers.
4590   const uint32_t *Mask = [&]() {
4591     auto AdaptedCC = CallConv;
4592     // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4593     // use X86_INTR calling convention because it has the same CSR mask
4594     // (same preserved registers).
4595     if (HasNCSR)
4596       AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
    // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4598     // to use the CSR_NoRegs_RegMask.
4599     if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4600       AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4601     return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4602   }();
4603   assert(Mask && "Missing call preserved mask for calling convention");
4604 
4605   // If this is an invoke in a 32-bit function using a funclet-based
4606   // personality, assume the function clobbers all registers. If an exception
4607   // is thrown, the runtime will not restore CSRs.
4608   // FIXME: Model this more precisely so that we can register allocate across
4609   // the normal edge and spill and fill across the exceptional edge.
4610   if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4611     const Function &CallerFn = MF.getFunction();
4612     EHPersonality Pers =
4613         CallerFn.hasPersonalityFn()
4614             ? classifyEHPersonality(CallerFn.getPersonalityFn())
4615             : EHPersonality::Unknown;
4616     if (isFuncletEHPersonality(Pers))
4617       Mask = RegInfo->getNoPreservedMask();
4618   }
4619 
4620   // Define a new register mask from the existing mask.
4621   uint32_t *RegMask = nullptr;
4622 
4623   // In some calling conventions we need to remove the used physical registers
4624   // from the reg mask.
4625   if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4626     const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4627 
4628     // Allocate a new Reg Mask and copy Mask.
4629     RegMask = MF.allocateRegMask();
4630     unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4631     memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4632 
4633     // Make sure all sub registers of the argument registers are reset
4634     // in the RegMask.
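    // (For example, if ECX carries an argument, the bits for ECX, CX, CH and CL
    // are all cleared so the whole register is treated as clobbered.)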
4635     for (auto const &RegPair : RegsToPass)
4636       for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4637            SubRegs.isValid(); ++SubRegs)
4638         RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4639 
4640     // Create the RegMask Operand according to our updated mask.
4641     Ops.push_back(DAG.getRegisterMask(RegMask));
4642   } else {
4643     // Create the RegMask Operand according to the static mask.
4644     Ops.push_back(DAG.getRegisterMask(Mask));
4645   }
4646 
4647   if (InFlag.getNode())
4648     Ops.push_back(InFlag);
4649 
4650   if (isTailCall) {
4651     // We used to do:
4652     //// If this is the first return lowered for this function, add the regs
4653     //// to the liveout set for the function.
4654     // This isn't right, although it's probably harmless on x86; liveouts
4655     // should be computed from returns not tail calls.  Consider a void
4656     // function making a tail call to a function returning int.
4657     MF.getFrameInfo().setHasTailCall();
4658     SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4659     DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4660     return Ret;
4661   }
4662 
4663   if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4664     Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4665   } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4666     // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4667     // expanded to the call, directly followed by a special marker sequence and
4668     // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
4669     assert(!isTailCall &&
4670            "tail calls cannot be marked with clang.arc.attachedcall");
4671     assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
4672 
4673     // Add a target global address for the retainRV/claimRV runtime function
4674     // just before the call target.
4675     Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
4676     auto PtrVT = getPointerTy(DAG.getDataLayout());
4677     auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
4678     Ops.insert(Ops.begin() + 1, GA);
4679     Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4680   } else {
4681     Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4682   }
4683 
4684   InFlag = Chain.getValue(1);
4685   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4686   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4687 
4688   // Save heapallocsite metadata.
4689   if (CLI.CB)
4690     if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
4691       DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4692 
4693   // Create the CALLSEQ_END node.
4694   unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
4695   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4696                        DAG.getTarget().Options.GuaranteedTailCallOpt))
4697     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
4698   else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
4699     // If this call passes a struct-return pointer, the callee
4700     // pops that struct pointer.
4701     NumBytesForCalleeToPop = 4;
4702 
4703   // Returns a flag for retval copy to use.
4704   if (!IsSibcall) {
4705     Chain = DAG.getCALLSEQ_END(Chain,
4706                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4707                                DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4708                                                      true),
4709                                InFlag, dl);
4710     InFlag = Chain.getValue(1);
4711   }
4712 
4713   // Handle result values, copying them out of physregs into vregs that we
4714   // return.
4715   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4716                          InVals, RegMask);
4717 }
4718 
4719 //===----------------------------------------------------------------------===//
4720 //                Fast Calling Convention (tail call) implementation
4721 //===----------------------------------------------------------------------===//
4722 
4723 //  Like stdcall, the callee cleans up the arguments, except that ECX is
4724 //  reserved for storing the address of the tail-called function. Only two
4725 //  registers are free for argument passing (inreg). Tail call optimization
4726 //  is performed provided:
4727 //                * tailcallopt is enabled
4728 //                * caller/callee are fastcc
4729 //  On X86_64 architecture with GOT-style position independent code only local
4730 //  (within module) calls are supported at the moment.
4731 //  To keep the stack aligned according to the platform ABI, the function
4732 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
4733 //  multiple of the stack alignment. (Dynamic linkers, e.g. Darwin's dyld, need this.)
4734 //  If a tail-called callee has more arguments than the caller, the caller needs
4735 //  to make sure that there is room to move the RETADDR to. This is achieved by
4736 //  reserving an area the size of the argument delta right after the original
4737 //  RETADDR, but before the saved frame pointer or the spilled registers,
4738 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
4739 //  stack layout:
4740 //    arg1
4741 //    arg2
4742 //    RETADDR
4743 //    [ new RETADDR
4744 //      move area ]
4745 //    (possible EBP)
4746 //    ESI
4747 //    EDI
4748 //    local1 ..
4749 
4750 /// Align the argument stack size so that, with the return address slot added,
4751 /// it is a multiple of the stack alignment, e.g. 16n + 12 for 16-byte alignment.
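/// For instance (illustrative): with a 4-byte slot and 16-byte stack alignment,
/// StackSize = 20 yields alignTo(20 + 4, 16) - 4 = 28, i.e. 16n + 12.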
4752 unsigned
4753 X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4754                                                SelectionDAG &DAG) const {
4755   const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
4756   const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4757   assert(StackSize % SlotSize == 0 &&
4758          "StackSize must be a multiple of SlotSize");
4759   return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4760 }
4761 
4762 /// Return true if the given stack call argument is already available in the
4763 /// same position (relatively) of the caller's incoming argument stack.
4764 static
4765 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4766                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4767                          const X86InstrInfo *TII, const CCValAssign &VA) {
4768   unsigned Bytes = Arg.getValueSizeInBits() / 8;
4769 
4770   for (;;) {
4771     // Look through nodes that don't alter the bits of the incoming value.
4772     unsigned Op = Arg.getOpcode();
4773     if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4774       Arg = Arg.getOperand(0);
4775       continue;
4776     }
4777     if (Op == ISD::TRUNCATE) {
4778       const SDValue &TruncInput = Arg.getOperand(0);
4779       if (TruncInput.getOpcode() == ISD::AssertZext &&
4780           cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4781               Arg.getValueType()) {
4782         Arg = TruncInput.getOperand(0);
4783         continue;
4784       }
4785     }
4786     break;
4787   }
4788 
4789   int FI = INT_MAX;
4790   if (Arg.getOpcode() == ISD::CopyFromReg) {
4791     Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4792     if (!VR.isVirtual())
4793       return false;
4794     MachineInstr *Def = MRI->getVRegDef(VR);
4795     if (!Def)
4796       return false;
4797     if (!Flags.isByVal()) {
4798       if (!TII->isLoadFromStackSlot(*Def, FI))
4799         return false;
4800     } else {
4801       unsigned Opcode = Def->getOpcode();
4802       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4803            Opcode == X86::LEA64_32r) &&
4804           Def->getOperand(1).isFI()) {
4805         FI = Def->getOperand(1).getIndex();
4806         Bytes = Flags.getByValSize();
4807       } else
4808         return false;
4809     }
4810   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4811     if (Flags.isByVal())
4812       // ByVal argument is passed in as a pointer but it's now being
4813       // dereferenced. e.g.
4814       // define @foo(%struct.X* %A) {
4815       //   tail call @bar(%struct.X* byval %A)
4816       // }
4817       return false;
4818     SDValue Ptr = Ld->getBasePtr();
4819     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4820     if (!FINode)
4821       return false;
4822     FI = FINode->getIndex();
4823   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4824     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4825     FI = FINode->getIndex();
4826     Bytes = Flags.getByValSize();
4827   } else
4828     return false;
4829 
4830   assert(FI != INT_MAX);
4831   if (!MFI.isFixedObjectIndex(FI))
4832     return false;
4833 
4834   if (Offset != MFI.getObjectOffset(FI))
4835     return false;
4836 
4837   // If this is not byval, check that the argument stack object is immutable.
4838   // inalloca and argument copy elision can create mutable argument stack
4839   // objects. Byval objects can be mutated, but a byval call intends to pass the
4840   // mutated memory.
4841   if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4842     return false;
4843 
4844   if (VA.getLocVT().getFixedSizeInBits() >
4845       Arg.getValueSizeInBits().getFixedSize()) {
4846     // If the argument location is wider than the argument type, check that any
4847     // extension flags match.
4848     if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4849         Flags.isSExt() != MFI.isObjectSExt(FI)) {
4850       return false;
4851     }
4852   }
4853 
4854   return Bytes == MFI.getObjectSize(FI);
4855 }
4856 
4857 /// Check whether the call is eligible for tail call optimization. Targets
4858 /// that want to do tail call optimization should implement this function.
4859 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4860     SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
4861     bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
4862     const SmallVectorImpl<SDValue> &OutVals,
4863     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4864   if (!mayTailCallThisCC(CalleeCC))
4865     return false;
4866 
4867   // If -tailcallopt is specified, make fastcc functions tail-callable.
4868   MachineFunction &MF = DAG.getMachineFunction();
4869   const Function &CallerF = MF.getFunction();
4870 
4871   // If the function return type is x86_fp80 and the callee return type is not,
4872   // then the FP_EXTEND of the call result is not a nop. It's not safe to
4873   // perform a tailcall optimization here.
4874   if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4875     return false;
4876 
4877   CallingConv::ID CallerCC = CallerF.getCallingConv();
4878   bool CCMatch = CallerCC == CalleeCC;
4879   bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4880   bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4881   bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4882       CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
4883 
4884   // Win64 functions have extra shadow space for argument homing. Don't do the
4885   // sibcall if the caller and callee have mismatched expectations for this
4886   // space.
4887   if (IsCalleeWin64 != IsCallerWin64)
4888     return false;
4889 
4890   if (IsGuaranteeTCO) {
4891     if (canGuaranteeTCO(CalleeCC) && CCMatch)
4892       return true;
4893     return false;
4894   }
4895 
4896   // Look for obvious safe cases to perform tail call optimization that do not
4897   // require ABI changes. This is what gcc calls sibcall.
4898 
4899   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4900   // emit a special epilogue.
4901   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4902   if (RegInfo->hasStackRealignment(MF))
4903     return false;
4904 
4905   // Also avoid sibcall optimization if we're an sret return fn and the callee
4906   // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
4907   // insufficient.
4908   if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
4909     // For a compatible tail call the callee must return our sret pointer. So it
4910     // needs to be (a) an sret function itself and (b) we pass our sret as its
4911     // sret. Condition #b is harder to determine.
4912     return false;
4913   } else if (IsCalleePopSRet)
4914     // The callee pops an sret, so we cannot tail-call, as our caller doesn't
4915     // expect that.
4916     return false;
4917 
4918   // Do not sibcall optimize vararg calls unless all arguments are passed via
4919   // registers.
4920   LLVMContext &C = *DAG.getContext();
4921   if (isVarArg && !Outs.empty()) {
4922     // Optimizing for varargs on Win64 is unlikely to be safe without
4923     // additional testing.
4924     if (IsCalleeWin64 || IsCallerWin64)
4925       return false;
4926 
4927     SmallVector<CCValAssign, 16> ArgLocs;
4928     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4929 
4930     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4931     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4932       if (!ArgLocs[i].isRegLoc())
4933         return false;
4934   }
4935 
4936   // If the call result is in ST0 / ST1, it needs to be popped off the x87
4937   // stack.  Therefore, if it's not used by the call it is not safe to optimize
4938   // this into a sibcall.
4939   bool Unused = false;
4940   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4941     if (!Ins[i].Used) {
4942       Unused = true;
4943       break;
4944     }
4945   }
4946   if (Unused) {
4947     SmallVector<CCValAssign, 16> RVLocs;
4948     CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4949     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4950     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4951       CCValAssign &VA = RVLocs[i];
4952       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4953         return false;
4954     }
4955   }
4956 
4957   // Check that the call results are passed in the same way.
4958   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4959                                   RetCC_X86, RetCC_X86))
4960     return false;
4961   // The callee has to preserve all registers the caller needs to preserve.
4962   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4963   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4964   if (!CCMatch) {
4965     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4966     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4967       return false;
4968   }
4969 
4970   unsigned StackArgsSize = 0;
4971 
4972   // If the callee takes no arguments then go on to check the results of the
4973   // call.
4974   if (!Outs.empty()) {
4975     // Check if stack adjustment is needed. For now, do not do this if any
4976     // argument is passed on the stack.
4977     SmallVector<CCValAssign, 16> ArgLocs;
4978     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4979 
4980     // Allocate shadow area for Win64
4981     if (IsCalleeWin64)
4982       CCInfo.AllocateStack(32, Align(8));
4983 
4984     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4985     StackArgsSize = CCInfo.getNextStackOffset();
4986 
4987     if (CCInfo.getNextStackOffset()) {
4988       // Check if the arguments are already laid out in the right way as
4989       // the caller's fixed stack objects.
4990       MachineFrameInfo &MFI = MF.getFrameInfo();
4991       const MachineRegisterInfo *MRI = &MF.getRegInfo();
4992       const X86InstrInfo *TII = Subtarget.getInstrInfo();
4993       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4994         CCValAssign &VA = ArgLocs[i];
4995         SDValue Arg = OutVals[i];
4996         ISD::ArgFlagsTy Flags = Outs[i].Flags;
4997         if (VA.getLocInfo() == CCValAssign::Indirect)
4998           return false;
4999         if (!VA.isRegLoc()) {
5000           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
5001                                    MFI, MRI, TII, VA))
5002             return false;
5003         }
5004       }
5005     }
5006 
5007     bool PositionIndependent = isPositionIndependent();
5008     // If the tailcall address may be in a register, then make sure it's
5009     // possible to register allocate for it. In 32-bit, the call address can
5010     // only target EAX, EDX, or ECX since the tail call must be scheduled after
5011     // callee-saved registers are restored. These happen to be the same
5012     // registers used to pass 'inreg' arguments so watch out for those.
5013     if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
5014                                   !isa<ExternalSymbolSDNode>(Callee)) ||
5015                                  PositionIndependent)) {
5016       unsigned NumInRegs = 0;
5017       // In PIC we need an extra register to formulate the address computation
5018       // for the callee.
5019       unsigned MaxInRegs = PositionIndependent ? 2 : 3;
5020 
5021       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5022         CCValAssign &VA = ArgLocs[i];
5023         if (!VA.isRegLoc())
5024           continue;
5025         Register Reg = VA.getLocReg();
5026         switch (Reg) {
5027         default: break;
5028         case X86::EAX: case X86::EDX: case X86::ECX:
5029           if (++NumInRegs == MaxInRegs)
5030             return false;
5031           break;
5032         }
5033       }
5034     }
5035 
5036     const MachineRegisterInfo &MRI = MF.getRegInfo();
5037     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5038       return false;
5039   }
5040 
5041   bool CalleeWillPop =
5042       X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
5043                        MF.getTarget().Options.GuaranteedTailCallOpt);
5044 
5045   if (unsigned BytesToPop =
5046           MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
5047     // If we have bytes to pop, the callee must pop them.
5048     bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
5049     if (!CalleePopMatches)
5050       return false;
5051   } else if (CalleeWillPop && StackArgsSize > 0) {
5052     // If we don't have bytes to pop, make sure the callee doesn't pop any.
5053     return false;
5054   }
5055 
5056   return true;
5057 }
5058 
5059 FastISel *
5060 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
5061                                   const TargetLibraryInfo *libInfo) const {
5062   return X86::createFastISel(funcInfo, libInfo);
5063 }
5064 
5065 //===----------------------------------------------------------------------===//
5066 //                           Other Lowering Hooks
5067 //===----------------------------------------------------------------------===//
5068 
5069 bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
5070                       bool AssumeSingleUse) {
5071   if (!AssumeSingleUse && !Op.hasOneUse())
5072     return false;
5073   if (!ISD::isNormalLoad(Op.getNode()))
5074     return false;
5075 
5076   // If this is an unaligned vector, make sure the target supports folding it.
5077   auto *Ld = cast<LoadSDNode>(Op.getNode());
5078   if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
5079       Ld->getValueSizeInBits(0) == 128 && Ld->getAlignment() < 16)
5080     return false;
5081 
5082   // TODO: If this is a non-temporal load and the target has an instruction
5083   //       for it, it should not be folded. See "useNonTemporalLoad()".
5084 
5085   return true;
5086 }
5087 
5088 bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5089                                           const X86Subtarget &Subtarget,
5090                                           bool AssumeSingleUse) {
5091   assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
5092   if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
5093     return false;
5094 
5095   // We cannot replace a wide volatile load with a broadcast-from-memory,
5096   // because that would narrow the load, which isn't legal for volatiles.
5097   auto *Ld = cast<LoadSDNode>(Op.getNode());
5098   return !Ld->isVolatile() ||
5099          Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5100 }
5101 
5102 bool X86::mayFoldIntoStore(SDValue Op) {
5103   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5104 }
5105 
5106 bool X86::mayFoldIntoZeroExtend(SDValue Op) {
5107   if (Op.hasOneUse()) {
5108     unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5109     return (ISD::ZERO_EXTEND == Opcode);
5110   }
5111   return false;
5112 }
5113 
5114 static bool isTargetShuffle(unsigned Opcode) {
5115   switch(Opcode) {
5116   default: return false;
5117   case X86ISD::BLENDI:
5118   case X86ISD::PSHUFB:
5119   case X86ISD::PSHUFD:
5120   case X86ISD::PSHUFHW:
5121   case X86ISD::PSHUFLW:
5122   case X86ISD::SHUFP:
5123   case X86ISD::INSERTPS:
5124   case X86ISD::EXTRQI:
5125   case X86ISD::INSERTQI:
5126   case X86ISD::VALIGN:
5127   case X86ISD::PALIGNR:
5128   case X86ISD::VSHLDQ:
5129   case X86ISD::VSRLDQ:
5130   case X86ISD::MOVLHPS:
5131   case X86ISD::MOVHLPS:
5132   case X86ISD::MOVSHDUP:
5133   case X86ISD::MOVSLDUP:
5134   case X86ISD::MOVDDUP:
5135   case X86ISD::MOVSS:
5136   case X86ISD::MOVSD:
5137   case X86ISD::MOVSH:
5138   case X86ISD::UNPCKL:
5139   case X86ISD::UNPCKH:
5140   case X86ISD::VBROADCAST:
5141   case X86ISD::VPERMILPI:
5142   case X86ISD::VPERMILPV:
5143   case X86ISD::VPERM2X128:
5144   case X86ISD::SHUF128:
5145   case X86ISD::VPERMIL2:
5146   case X86ISD::VPERMI:
5147   case X86ISD::VPPERM:
5148   case X86ISD::VPERMV:
5149   case X86ISD::VPERMV3:
5150   case X86ISD::VZEXT_MOVL:
5151     return true;
5152   }
5153 }
5154 
5155 static bool isTargetShuffleVariableMask(unsigned Opcode) {
5156   switch (Opcode) {
5157   default: return false;
5158   // Target Shuffles.
5159   case X86ISD::PSHUFB:
5160   case X86ISD::VPERMILPV:
5161   case X86ISD::VPERMIL2:
5162   case X86ISD::VPPERM:
5163   case X86ISD::VPERMV:
5164   case X86ISD::VPERMV3:
5165     return true;
5166   // 'Faux' Target Shuffles.
5167   case ISD::OR:
5168   case ISD::AND:
5169   case X86ISD::ANDNP:
5170     return true;
5171   }
5172 }
5173 
5174 static bool isTargetShuffleSplat(SDValue Op) {
5175   unsigned Opcode = Op.getOpcode();
5176   if (Opcode == ISD::EXTRACT_SUBVECTOR)
5177     return isTargetShuffleSplat(Op.getOperand(0));
5178   return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
5179 }
5180 
5181 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5182   MachineFunction &MF = DAG.getMachineFunction();
5183   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5184   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5185   int ReturnAddrIndex = FuncInfo->getRAIndex();
5186 
5187   if (ReturnAddrIndex == 0) {
5188     // Set up a frame object for the return address.
5189     unsigned SlotSize = RegInfo->getSlotSize();
5190     ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5191                                                           -(int64_t)SlotSize,
5192                                                           false);
5193     FuncInfo->setRAIndex(ReturnAddrIndex);
5194   }
5195 
5196   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5197 }
5198 
5199 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5200                                        bool hasSymbolicDisplacement) {
5201   // The offset should fit into a 32-bit immediate field.
5202   if (!isInt<32>(Offset))
5203     return false;
5204 
5205   // If we don't have a symbolic displacement - we don't have any extra
5206   // restrictions.
5207   if (!hasSymbolicDisplacement)
5208     return true;
5209 
5210   // FIXME: Some tweaks might be needed for medium code model.
5211   if (M != CodeModel::Small && M != CodeModel::Kernel)
5212     return false;
5213 
5214   // For the small code model we assume that the last object lies at least 16MB
5215   // before the 2GB (31-bit) boundary. We also accept fairly large negative
5216   // offsets, since all objects live in the positive half of the address space.
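  // (Illustrative: a -1MB offset is accepted by the check below, while a +32MB
  // offset falls through and is ultimately rejected.)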
5217   if (M == CodeModel::Small && Offset < 16*1024*1024)
5218     return true;
5219 
5220   // For the kernel code model we know that all objects reside in the negative
5221   // half of the 32-bit address space. We cannot accept negative offsets, since
5222   // they might fall just out of range, but we can accept fairly large positive ones.
5223   if (M == CodeModel::Kernel && Offset >= 0)
5224     return true;
5225 
5226   return false;
5227 }
5228 
5229 /// Determines whether the callee is required to pop its own arguments.
5230 /// Callee pop is necessary to support tail calls.
5231 bool X86::isCalleePop(CallingConv::ID CallingConv,
5232                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5233   // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5234   // can guarantee TCO.
5235   if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5236     return true;
5237 
5238   switch (CallingConv) {
5239   default:
5240     return false;
5241   case CallingConv::X86_StdCall:
5242   case CallingConv::X86_FastCall:
5243   case CallingConv::X86_ThisCall:
5244   case CallingConv::X86_VectorCall:
5245     return !is64Bit;
5246   }
5247 }
5248 
5249 /// Return true if the condition is a signed comparison operation.
5250 static bool isX86CCSigned(unsigned X86CC) {
5251   switch (X86CC) {
5252   default:
5253     llvm_unreachable("Invalid integer condition!");
5254   case X86::COND_E:
5255   case X86::COND_NE:
5256   case X86::COND_B:
5257   case X86::COND_A:
5258   case X86::COND_BE:
5259   case X86::COND_AE:
5260     return false;
5261   case X86::COND_G:
5262   case X86::COND_GE:
5263   case X86::COND_L:
5264   case X86::COND_LE:
5265     return true;
5266   }
5267 }
5268 
5269 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5270   switch (SetCCOpcode) {
5271   default: llvm_unreachable("Invalid integer condition!");
5272   case ISD::SETEQ:  return X86::COND_E;
5273   case ISD::SETGT:  return X86::COND_G;
5274   case ISD::SETGE:  return X86::COND_GE;
5275   case ISD::SETLT:  return X86::COND_L;
5276   case ISD::SETLE:  return X86::COND_LE;
5277   case ISD::SETNE:  return X86::COND_NE;
5278   case ISD::SETULT: return X86::COND_B;
5279   case ISD::SETUGT: return X86::COND_A;
5280   case ISD::SETULE: return X86::COND_BE;
5281   case ISD::SETUGE: return X86::COND_AE;
5282   }
5283 }
5284 
5285 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
5286 /// condition code, returning the condition code and the LHS/RHS of the
5287 /// comparison to make.
5288 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5289                                     bool isFP, SDValue &LHS, SDValue &RHS,
5290                                     SelectionDAG &DAG) {
5291   if (!isFP) {
5292     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5293       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
5294         // X > -1   -> X == 0, jump !sign.
5295         RHS = DAG.getConstant(0, DL, RHS.getValueType());
5296         return X86::COND_NS;
5297       }
5298       if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
5299         // X < 0   -> X == 0, jump on sign.
5300         return X86::COND_S;
5301       }
5302       if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
5303         // X >= 0   -> X == 0, jump on !sign.
5304         return X86::COND_NS;
5305       }
5306       if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5307         // X < 1   -> X <= 0
5308         RHS = DAG.getConstant(0, DL, RHS.getValueType());
5309         return X86::COND_LE;
5310       }
5311     }
5312 
5313     return TranslateIntegerX86CC(SetCCOpcode);
5314   }
5315 
5316   // First determine if it is required or is profitable to flip the operands.
5317 
5318   // If LHS is a foldable load, but RHS is not, flip the condition.
5319   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5320       !ISD::isNON_EXTLoad(RHS.getNode())) {
5321     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5322     std::swap(LHS, RHS);
5323   }
5324 
5325   switch (SetCCOpcode) {
5326   default: break;
5327   case ISD::SETOLT:
5328   case ISD::SETOLE:
5329   case ISD::SETUGT:
5330   case ISD::SETUGE:
5331     std::swap(LHS, RHS);
5332     break;
5333   }
5334 
5335   // On a floating point condition, the flags are set as follows:
5336   // ZF  PF  CF   op
5337   //  0 | 0 | 0 | X > Y
5338   //  0 | 0 | 1 | X < Y
5339   //  1 | 0 | 0 | X == Y
5340   //  1 | 1 | 1 | unordered
5341   switch (SetCCOpcode) {
5342   default: llvm_unreachable("Condcode should be pre-legalized away");
5343   case ISD::SETUEQ:
5344   case ISD::SETEQ:   return X86::COND_E;
5345   case ISD::SETOLT:              // flipped
5346   case ISD::SETOGT:
5347   case ISD::SETGT:   return X86::COND_A;
5348   case ISD::SETOLE:              // flipped
5349   case ISD::SETOGE:
5350   case ISD::SETGE:   return X86::COND_AE;
5351   case ISD::SETUGT:              // flipped
5352   case ISD::SETULT:
5353   case ISD::SETLT:   return X86::COND_B;
5354   case ISD::SETUGE:              // flipped
5355   case ISD::SETULE:
5356   case ISD::SETLE:   return X86::COND_BE;
5357   case ISD::SETONE:
5358   case ISD::SETNE:   return X86::COND_NE;
5359   case ISD::SETUO:   return X86::COND_P;
5360   case ISD::SETO:    return X86::COND_NP;
5361   case ISD::SETOEQ:
5362   case ISD::SETUNE:  return X86::COND_INVALID;
5363   }
5364 }
5365 
5366 /// Is there a floating point cmov for the specific X86 condition code?
5367 /// The current x86 ISA includes the following FP cmov instructions:
5368 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5369 static bool hasFPCMov(unsigned X86CC) {
5370   switch (X86CC) {
5371   default:
5372     return false;
5373   case X86::COND_B:
5374   case X86::COND_BE:
5375   case X86::COND_E:
5376   case X86::COND_P:
5377   case X86::COND_A:
5378   case X86::COND_AE:
5379   case X86::COND_NE:
5380   case X86::COND_NP:
5381     return true;
5382   }
5383 }
5384 
5385 static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
5386   return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
5387          VT.is512BitVector();
5388 }
5389 
5390 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5391                                            const CallInst &I,
5392                                            MachineFunction &MF,
5393                                            unsigned Intrinsic) const {
5394   Info.flags = MachineMemOperand::MONone;
5395   Info.offset = 0;
5396 
5397   const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5398   if (!IntrData) {
5399     switch (Intrinsic) {
5400     case Intrinsic::x86_aesenc128kl:
5401     case Intrinsic::x86_aesdec128kl:
5402       Info.opc = ISD::INTRINSIC_W_CHAIN;
5403       Info.ptrVal = I.getArgOperand(1);
5404       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5405       Info.align = Align(1);
5406       Info.flags |= MachineMemOperand::MOLoad;
5407       return true;
5408     case Intrinsic::x86_aesenc256kl:
5409     case Intrinsic::x86_aesdec256kl:
5410       Info.opc = ISD::INTRINSIC_W_CHAIN;
5411       Info.ptrVal = I.getArgOperand(1);
5412       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5413       Info.align = Align(1);
5414       Info.flags |= MachineMemOperand::MOLoad;
5415       return true;
5416     case Intrinsic::x86_aesencwide128kl:
5417     case Intrinsic::x86_aesdecwide128kl:
5418       Info.opc = ISD::INTRINSIC_W_CHAIN;
5419       Info.ptrVal = I.getArgOperand(0);
5420       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5421       Info.align = Align(1);
5422       Info.flags |= MachineMemOperand::MOLoad;
5423       return true;
5424     case Intrinsic::x86_aesencwide256kl:
5425     case Intrinsic::x86_aesdecwide256kl:
5426       Info.opc = ISD::INTRINSIC_W_CHAIN;
5427       Info.ptrVal = I.getArgOperand(0);
5428       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5429       Info.align = Align(1);
5430       Info.flags |= MachineMemOperand::MOLoad;
5431       return true;
5432     }
5433     return false;
5434   }
5435 
5436   switch (IntrData->Type) {
5437   case TRUNCATE_TO_MEM_VI8:
5438   case TRUNCATE_TO_MEM_VI16:
5439   case TRUNCATE_TO_MEM_VI32: {
5440     Info.opc = ISD::INTRINSIC_VOID;
5441     Info.ptrVal = I.getArgOperand(0);
5442     MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
5443     MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5444     if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5445       ScalarVT = MVT::i8;
5446     else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5447       ScalarVT = MVT::i16;
5448     else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5449       ScalarVT = MVT::i32;
5450 
5451     Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5452     Info.align = Align(1);
5453     Info.flags |= MachineMemOperand::MOStore;
5454     break;
5455   }
5456   case GATHER:
5457   case GATHER_AVX2: {
5458     Info.opc = ISD::INTRINSIC_W_CHAIN;
5459     Info.ptrVal = nullptr;
5460     MVT DataVT = MVT::getVT(I.getType());
5461     MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5462     unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5463                                 IndexVT.getVectorNumElements());
5464     Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5465     Info.align = Align(1);
5466     Info.flags |= MachineMemOperand::MOLoad;
5467     break;
5468   }
5469   case SCATTER: {
5470     Info.opc = ISD::INTRINSIC_VOID;
5471     Info.ptrVal = nullptr;
5472     MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5473     MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5474     unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5475                                 IndexVT.getVectorNumElements());
5476     Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5477     Info.align = Align(1);
5478     Info.flags |= MachineMemOperand::MOStore;
5479     break;
5480   }
5481   default:
5482     return false;
5483   }
5484 
5485   return true;
5486 }
5487 
5488 /// Returns true if the target can instruction select the
5489 /// specified FP immediate natively. If false, the legalizer will
5490 /// materialize the FP immediate as a load from a constant pool.
5491 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5492                                      bool ForCodeSize) const {
5493   for (const APFloat &FPImm : LegalFPImmediates)
5494     if (Imm.bitwiseIsEqual(FPImm))
5495       return true;
5496   return false;
5497 }
5498 
5499 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5500                                               ISD::LoadExtType ExtTy,
5501                                               EVT NewVT) const {
5502   assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5503 
5504   // "ELF Handling for Thread-Local Storage" specifies that the R_X86_64_GOTTPOFF
5505   // relocation must target a movq or addq instruction: don't let the load shrink.
5506   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5507   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5508     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5509       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5510 
5511   // If this is (1) an AVX vector load with (2) multiple uses and (3) all of
5512   // those uses are extracted directly into a store, then the extract + store
5513   // can be store-folded. Therefore, it's probably not worth splitting the load.
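  // For example, a 256-bit load whose only non-chain uses are two
  // extract_subvector+store pairs is kept whole so each extract can fold into
  // its store.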
5514   EVT VT = Load->getValueType(0);
5515   if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5516     for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5517       // Skip uses of the chain value. Result 0 of the node is the load value.
5518       if (UI.getUse().getResNo() != 0)
5519         continue;
5520 
5521       // If this use is not an extract + store, it's probably worth splitting.
5522       if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5523           UI->use_begin()->getOpcode() != ISD::STORE)
5524         return true;
5525     }
5526     // All non-chain uses are extract + store.
5527     return false;
5528   }
5529 
5530   return true;
5531 }
5532 
5533 /// Returns true if it is beneficial to convert a load of a constant
5534 /// to just the constant itself.
5535 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5536                                                           Type *Ty) const {
5537   assert(Ty->isIntegerTy());
5538 
5539   unsigned BitSize = Ty->getPrimitiveSizeInBits();
5540   if (BitSize == 0 || BitSize > 64)
5541     return false;
5542   return true;
5543 }
5544 
5545 bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5546   // If we are using XMM registers in the ABI and the condition of the select is
5547   // a floating-point compare and we have blendv or conditional move, then it is
5548   // cheaper to select instead of doing a cross-register move and creating a
5549   // load that depends on the compare result.
5550   bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5551   return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5552 }
5553 
5554 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5555   // TODO: It might be a win to ease or lift this restriction, but the generic
5556   // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5557   if (VT.isVector() && Subtarget.hasAVX512())
5558     return false;
5559 
5560   return true;
5561 }
5562 
5563 bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5564                                                SDValue C) const {
5565   // TODO: We handle scalars using custom code, but generic combining could make
5566   // that unnecessary.
5567   APInt MulC;
5568   if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5569     return false;
5570 
5571   // Find the type this will be legalized to. Otherwise we might prematurely
5572   // convert this to shl+add/sub and then still have to type legalize those ops.
5573   // Another choice would be to defer the decision for illegal types until
5574   // after type legalization. But constant splat vectors of i64 can't make it
5575   // through type legalization on 32-bit targets so we would need to special
5576   // case vXi64.
5577   while (getTypeAction(Context, VT) != TypeLegal)
5578     VT = getTypeToTransformTo(Context, VT);
5579 
5580   // If vector multiply is legal, assume that's faster than shl + add/sub.
5581   // Multiply is a complex op with higher latency and lower throughput in
5582   // most implementations; sub-vXi32 vector multiplies are always fast,
5583   // vXi32 must not be on a target with slow PMULLD, and anything larger (vXi64)
5584   // is always going to be slow.
5585   unsigned EltSizeInBits = VT.getScalarSizeInBits();
5586   if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
5587       (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
5588     return false;
5589 
5590   // shl+add, shl+sub, shl+add+neg
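  // For example: x*9 -> (x << 3) + x, x*7 -> (x << 3) - x, and
  // x*-9 -> -((x << 3) + x).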
5591   return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5592          (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5593 }
5594 
5595 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5596                                                 unsigned Index) const {
5597   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5598     return false;
5599 
5600   // Mask vectors support all subregister combinations and operations that
5601   // extract half of a vector.
5602   if (ResVT.getVectorElementType() == MVT::i1)
5603     return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5604                           (Index == ResVT.getVectorNumElements()));
5605 
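  // Otherwise, only extractions aligned to the result width are cheap, e.g.
  // (illustrative) a v4i32 subvector of v8i32 at index 0 or 4, but not index 2.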
5606   return (Index % ResVT.getVectorNumElements()) == 0;
5607 }
5608 
5609 bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5610   unsigned Opc = VecOp.getOpcode();
5611 
5612   // Assume target opcodes can't be scalarized.
5613   // TODO - do we have any exceptions?
5614   if (Opc >= ISD::BUILTIN_OP_END)
5615     return false;
5616 
5617   // If the vector op is not supported, try to convert to scalar.
5618   EVT VecVT = VecOp.getValueType();
5619   if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5620     return true;
5621 
5622   // If the vector op is supported, but the scalar op is not, the transform may
5623   // not be worthwhile.
5624   EVT ScalarVT = VecVT.getScalarType();
5625   return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5626 }
5627 
5628 bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5629                                              bool) const {
5630   // TODO: Allow vectors?
5631   if (VT.isVector())
5632     return false;
5633   return VT.isSimple() || !isOperationExpand(Opcode, VT);
5634 }
5635 
5636 bool X86TargetLowering::isCheapToSpeculateCttz() const {
5637   // Speculate cttz only if we can directly use TZCNT.
5638   return Subtarget.hasBMI();
5639 }
5640 
5641 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5642   // Speculate ctlz only if we can directly use LZCNT.
5643   return Subtarget.hasLZCNT();
5644 }
5645 
5646 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5647                                                 const SelectionDAG &DAG,
5648                                                 const MachineMemOperand &MMO) const {
5649   if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5650       BitcastVT.getVectorElementType() == MVT::i1)
5651     return false;
5652 
5653   if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5654     return false;
5655 
5656   // If both types are legal vectors, it's always ok to convert them.
5657   if (LoadVT.isVector() && BitcastVT.isVector() &&
5658       isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5659     return true;
5660 
5661   return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5662 }
5663 
5664 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5665                                          const MachineFunction &MF) const {
5666   // Do not merge stores to a float/vector value size (e.g. a 128-bit XMM
5667   // value) when the NoImplicitFloat attribute is set.
5668   bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
5669 
5670   if (NoFloat) {
5671     unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5672     return (MemVT.getSizeInBits() <= MaxIntSize);
5673   }
5674   // Make sure we don't merge greater than our preferred vector
5675   // width.
5676   if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5677     return false;
5678 
5679   return true;
5680 }
5681 
5682 bool X86TargetLowering::isCtlzFast() const {
5683   return Subtarget.hasFastLZCNT();
5684 }
5685 
5686 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5687     const Instruction &AndI) const {
5688   return true;
5689 }
5690 
5691 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5692   EVT VT = Y.getValueType();
5693 
5694   if (VT.isVector())
5695     return false;
5696 
5697   if (!Subtarget.hasBMI())
5698     return false;
5699 
5700   // There are only 32-bit and 64-bit forms for 'andn'.
5701   if (VT != MVT::i32 && VT != MVT::i64)
5702     return false;
5703 
5704   return !isa<ConstantSDNode>(Y);
5705 }
5706 
5707 bool X86TargetLowering::hasAndNot(SDValue Y) const {
5708   EVT VT = Y.getValueType();
5709 
5710   if (!VT.isVector())
5711     return hasAndNotCompare(Y);
5712 
5713   // Vector.
5714 
5715   if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5716     return false;
5717 
5718   if (VT == MVT::v4i32)
5719     return true;
5720 
5721   return Subtarget.hasSSE2();
5722 }
5723 
5724 bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5725   return X.getValueType().isScalarInteger(); // 'bt'
5726 }
5727 
5728 bool X86TargetLowering::
5729     shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5730         SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5731         unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5732         SelectionDAG &DAG) const {
5733   // Does baseline recommend not to perform the fold by default?
5734   if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5735           X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5736     return false;
5737   // For scalars this transform is always beneficial.
5738   if (X.getValueType().isScalarInteger())
5739     return true;
5740   // If all the shift amounts are identical, then transform is beneficial even
5741   // with rudimentary SSE2 shifts.
5742   if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5743     return true;
5744   // If we have AVX2 with its powerful shift operations, then it's also good.
5745   if (Subtarget.hasAVX2())
5746     return true;
5747   // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
5748   return NewShiftOpcode == ISD::SHL;
5749 }
5750 
5751 bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5752     const SDNode *N, CombineLevel Level) const {
5753   assert(((N->getOpcode() == ISD::SHL &&
5754            N->getOperand(0).getOpcode() == ISD::SRL) ||
5755           (N->getOpcode() == ISD::SRL &&
5756            N->getOperand(0).getOpcode() == ISD::SHL)) &&
5757          "Expected shift-shift mask");
5758   EVT VT = N->getValueType(0);
5759   if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5760       (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5761     // Only fold if the shift values are equal - so it folds to AND.
5762     // TODO - we should fold if either is a non-uniform vector but we don't do
5763     // the fold for non-splats yet.
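    // For example, for i32: (srl (shl X, 8), 8) folds to (and X, 0x00FFFFFF),
    // and (shl (srl X, 8), 8) folds to (and X, 0xFFFFFF00).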
5764     return N->getOperand(1) == N->getOperand(0).getOperand(1);
5765   }
5766   return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5767 }
5768 
5769 bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5770   EVT VT = Y.getValueType();
5771 
5772   // For vectors, we don't have a preference, but we probably want a mask.
5773   if (VT.isVector())
5774     return false;
5775 
5776   // 64-bit shifts on 32-bit targets produce really bad bloated code.
5777   if (VT == MVT::i64 && !Subtarget.is64Bit())
5778     return false;
5779 
5780   return true;
5781 }
5782 
5783 bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5784                                           SDNode *N) const {
5785   if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5786       !Subtarget.isOSWindows())
5787     return false;
5788   return true;
5789 }
5790 
5791 bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5792   // Any legal vector type can be splatted more efficiently than
5793   // loading/spilling from memory.
5794   return isTypeLegal(VT);
5795 }
5796 
5797 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5798   MVT VT = MVT::getIntegerVT(NumBits);
5799   if (isTypeLegal(VT))
5800     return VT;
5801 
5802   // PMOVMSKB can handle this.
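  // (For example, a 128-bit equality test can be lowered to PCMPEQB + PMOVMSKB
  // followed by comparing the 16-bit mask against 0xFFFF.)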
5803   if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5804     return MVT::v16i8;
5805 
5806   // VPMOVMSKB can handle this.
5807   if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5808     return MVT::v32i8;
5809 
5810   // TODO: Allow 64-bit type for 32-bit target.
5811   // TODO: 512-bit types should be allowed, but make sure that those
5812   // cases are handled in combineVectorSizedSetCCEquality().
5813 
5814   return MVT::INVALID_SIMPLE_VALUE_TYPE;
5815 }
5816 
5817 /// Val is the undef sentinel value or equal to the specified value.
5818 static bool isUndefOrEqual(int Val, int CmpVal) {
5819   return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5820 }
5821 
5822 /// Return true if every element in Mask is the undef sentinel value or equal to
5823 /// the specified value.
5824 static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
5825   return llvm::all_of(Mask, [CmpVal](int M) {
5826     return (M == SM_SentinelUndef) || (M == CmpVal);
5827   });
5828 }
5829 
5830 /// Val is either the undef or zero sentinel value.
5831 static bool isUndefOrZero(int Val) {
5832   return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5833 }
5834 
5835 /// Return true if every element in Mask, beginning from position Pos and ending
5836 /// in Pos+Size is the undef sentinel value.
5837 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5838   return llvm::all_of(Mask.slice(Pos, Size),
5839                       [](int M) { return M == SM_SentinelUndef; });
5840 }
5841 
5842 /// Return true if the mask creates a vector whose lower half is undefined.
5843 static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5844   unsigned NumElts = Mask.size();
5845   return isUndefInRange(Mask, 0, NumElts / 2);
5846 }
5847 
5848 /// Return true if the mask creates a vector whose upper half is undefined.
5849 static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5850   unsigned NumElts = Mask.size();
5851   return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5852 }
5853 
5854 /// Return true if Val falls within the specified range [Low, Hi).
5855 static bool isInRange(int Val, int Low, int Hi) {
5856   return (Val >= Low && Val < Hi);
5857 }
5858 
5859 /// Return true if the value of any element in Mask falls within the specified
5860 /// range [Low, Hi).
5861 static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5862   return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5863 }
5864 
5865 /// Return true if the value of any element in Mask is the zero sentinel value.
5866 static bool isAnyZero(ArrayRef<int> Mask) {
5867   return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
5868 }
5869 
5870 /// Return true if the value of any element in Mask is the zero or undef
5871 /// sentinel values.
5872 static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
5873   return llvm::any_of(Mask, [](int M) {
5874     return M == SM_SentinelZero || M == SM_SentinelUndef;
5875   });
5876 }
5877 
5878 /// Return true if Val is undef or if its value falls within the
5879 /// specified range [Low, Hi).
5880 static bool isUndefOrInRange(int Val, int Low, int Hi) {
5881   return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5882 }
5883 
5884 /// Return true if every element in Mask is undef or if its value
5885 /// falls within the specified range [Low, Hi).
5886 static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5887   return llvm::all_of(
5888       Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5889 }
5890 
5891 /// Return true if Val is undef, zero or if its value falls within the
5892 /// specified range [Low, Hi).
5893 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5894   return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5895 }
5896 
5897 /// Return true if every element in Mask is undef, zero or if its value
5898 /// falls within the specified range [Low, Hi).
5899 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5900   return llvm::all_of(
5901       Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5902 }
5903 
5904 /// Return true if every element in Mask, beginning
5905 /// from position Pos and ending in Pos + Size, falls within the specified
5906 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
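/// For example (illustrative), Mask = <4, -1, 6, 7> with Pos = 0, Size = 4,
/// Low = 4 and Step = 1 returns true.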
5907 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5908                                        unsigned Size, int Low, int Step = 1) {
5909   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5910     if (!isUndefOrEqual(Mask[i], Low))
5911       return false;
5912   return true;
5913 }
5914 
5915 /// Return true if every element in Mask, beginning
5916 /// from position Pos and ending in Pos+Size, falls within the specified
5917 /// sequential range [Low, Low + Size), or is undef or is zero.
5918 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5919                                              unsigned Size, int Low,
5920                                              int Step = 1) {
5921   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5922     if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5923       return false;
5924   return true;
5925 }
5926 
5927 /// Return true if every element in Mask, beginning
5928 /// from position Pos and ending in Pos+Size is undef or is zero.
5929 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5930                                  unsigned Size) {
5931   return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
5932 }
5933 
5934 /// Helper function to test whether a shuffle mask could be
5935 /// simplified by widening the elements being shuffled.
5936 ///
5937 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5938 /// leaves it in an unspecified state.
5939 ///
5940 /// NOTE: This must handle normal vector shuffle masks and *target* vector
5941 /// shuffle masks. The latter have the special property of a '-2' representing
5942 /// a zero-ed lane of a vector.
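/// For example (illustrative), the mask <0, 1, 6, 7> widens to <0, 3>, while
/// <1, 0, 6, 7> cannot be widened.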
5943 static bool canWidenShuffleElements(ArrayRef<int> Mask,
5944                                     SmallVectorImpl<int> &WidenedMask) {
5945   WidenedMask.assign(Mask.size() / 2, 0);
5946   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5947     int M0 = Mask[i];
5948     int M1 = Mask[i + 1];
5949 
5950     // If both elements are undef, it's trivial.
5951     if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5952       WidenedMask[i / 2] = SM_SentinelUndef;
5953       continue;
5954     }
5955 
5956     // Check for an undef mask and a mask value properly aligned to fit with
5957     // a pair of values. If we find such a case, use the non-undef mask's value.
5958     if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
5959       WidenedMask[i / 2] = M1 / 2;
5960       continue;
5961     }
5962     if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
5963       WidenedMask[i / 2] = M0 / 2;
5964       continue;
5965     }
5966 
5967     // When zeroing, we need to spread the zeroing across both lanes to widen.
5968     if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
5969       if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
5970           (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
5971         WidenedMask[i / 2] = SM_SentinelZero;
5972         continue;
5973       }
5974       return false;
5975     }
5976 
5977     // Finally check if the two mask values are adjacent and aligned with
5978     // a pair.
5979     if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
5980       WidenedMask[i / 2] = M0 / 2;
5981       continue;
5982     }
5983 
5984     // Otherwise we can't safely widen the elements used in this shuffle.
5985     return false;
5986   }
5987   assert(WidenedMask.size() == Mask.size() / 2 &&
5988          "Incorrect size of mask after widening the elements!");
5989 
5990   return true;
5991 }
5992 
5993 static bool canWidenShuffleElements(ArrayRef<int> Mask,
5994                                     const APInt &Zeroable,
5995                                     bool V2IsZero,
5996                                     SmallVectorImpl<int> &WidenedMask) {
5997   // Create an alternative mask with info about zeroable elements.
5998   // Here we do not set undef elements as zeroable.
5999   SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
6000   if (V2IsZero) {
6001     assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
6002     for (int i = 0, Size = Mask.size(); i != Size; ++i)
6003       if (Mask[i] != SM_SentinelUndef && Zeroable[i])
6004         ZeroableMask[i] = SM_SentinelZero;
6005   }
6006   return canWidenShuffleElements(ZeroableMask, WidenedMask);
6007 }
6008 
6009 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
6010   SmallVector<int, 32> WidenedMask;
6011   return canWidenShuffleElements(Mask, WidenedMask);
6012 }
6013 
6014 // Attempt to narrow/widen shuffle mask until it matches the target number of
6015 // elements.
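// e.g. narrowing <0,2> to 4 elements gives <0,1,4,5>, while widening
// <0,1,6,7> to 2 elements gives <0,3>.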
6016 static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
6017                                  SmallVectorImpl<int> &ScaledMask) {
6018   unsigned NumSrcElts = Mask.size();
6019   assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
6020          "Illegal shuffle scale factor");
6021 
6022   // Narrowing is guaranteed to work.
6023   if (NumDstElts >= NumSrcElts) {
6024     int Scale = NumDstElts / NumSrcElts;
6025     llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
6026     return true;
6027   }
6028 
6029   // We have to repeat the widening until we reach the target size, but we can
6030   // split out the first widening as it sets up ScaledMask for us.
6031   if (canWidenShuffleElements(Mask, ScaledMask)) {
6032     while (ScaledMask.size() > NumDstElts) {
6033       SmallVector<int, 16> WidenedMask;
6034       if (!canWidenShuffleElements(ScaledMask, WidenedMask))
6035         return false;
6036       ScaledMask = std::move(WidenedMask);
6037     }
6038     return true;
6039   }
6040 
6041   return false;
6042 }
6043 
6044 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
6045 bool X86::isZeroNode(SDValue Elt) {
6046   return isNullConstant(Elt) || isNullFPConstant(Elt);
6047 }
6048 
6049 // Build a vector of constants.
6050 // Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants into 32-bit halves in 32-bit mode.
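// e.g. (illustrative) on a 32-bit target the v2i64 constant <1, 3> is built as
// the v4i32 build_vector <1, 0, 3, 0> and bitcast back to v2i64.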
6052 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
6053                               const SDLoc &dl, bool IsMask = false) {
6054 
  SmallVector<SDValue, 32> Ops;
6056   bool Split = false;
6057 
6058   MVT ConstVecVT = VT;
6059   unsigned NumElts = VT.getVectorNumElements();
6060   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6061   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6062     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6063     Split = true;
6064   }
6065 
6066   MVT EltVT = ConstVecVT.getVectorElementType();
6067   for (unsigned i = 0; i < NumElts; ++i) {
6068     bool IsUndef = Values[i] < 0 && IsMask;
6069     SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6070       DAG.getConstant(Values[i], dl, EltVT);
6071     Ops.push_back(OpNode);
6072     if (Split)
6073       Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6074                     DAG.getConstant(0, dl, EltVT));
6075   }
6076   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6077   if (Split)
6078     ConstsNode = DAG.getBitcast(VT, ConstsNode);
6079   return ConstsNode;
6080 }
6081 
6082 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
6083                               MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6084   assert(Bits.size() == Undefs.getBitWidth() &&
6085          "Unequal constant and undef arrays");
6086   SmallVector<SDValue, 32> Ops;
6087   bool Split = false;
6088 
6089   MVT ConstVecVT = VT;
6090   unsigned NumElts = VT.getVectorNumElements();
6091   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6092   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6093     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6094     Split = true;
6095   }
6096 
6097   MVT EltVT = ConstVecVT.getVectorElementType();
6098   for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6099     if (Undefs[i]) {
6100       Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6101       continue;
6102     }
6103     const APInt &V = Bits[i];
6104     assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6105     if (Split) {
6106       Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6107       Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6108     } else if (EltVT == MVT::f32) {
6109       APFloat FV(APFloat::IEEEsingle(), V);
6110       Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6111     } else if (EltVT == MVT::f64) {
6112       APFloat FV(APFloat::IEEEdouble(), V);
6113       Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6114     } else {
6115       Ops.push_back(DAG.getConstant(V, dl, EltVT));
6116     }
6117   }
6118 
6119   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6120   return DAG.getBitcast(VT, ConstsNode);
6121 }
6122 
6123 /// Returns a vector of specified type with all zero elements.
6124 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6125                              SelectionDAG &DAG, const SDLoc &dl) {
6126   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6127           VT.getVectorElementType() == MVT::i1) &&
6128          "Unexpected vector type");
6129 
6130   // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6131   // type. This ensures they get CSE'd. But if the integer type is not
6132   // available, use a floating-point +0.0 instead.
6133   SDValue Vec;
6134   if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6135     Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6136   } else if (VT.isFloatingPoint()) {
6137     Vec = DAG.getConstantFP(+0.0, dl, VT);
6138   } else if (VT.getVectorElementType() == MVT::i1) {
6139     assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6140            "Unexpected vector type");
6141     Vec = DAG.getConstant(0, dl, VT);
6142   } else {
6143     unsigned Num32BitElts = VT.getSizeInBits() / 32;
6144     Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6145   }
6146   return DAG.getBitcast(VT, Vec);
6147 }
6148 
// Helper to determine if the ops are all subvectors extracted from a single
// source. If we allow commute they don't have to be in order (Lo/Hi).
6151 static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
6152   if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6153       RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6154       LHS.getValueType() != RHS.getValueType() ||
6155       LHS.getOperand(0) != RHS.getOperand(0))
6156     return SDValue();
6157 
6158   SDValue Src = LHS.getOperand(0);
6159   if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
6160     return SDValue();
6161 
6162   unsigned NumElts = LHS.getValueType().getVectorNumElements();
6163   if ((LHS.getConstantOperandAPInt(1) == 0 &&
6164        RHS.getConstantOperandAPInt(1) == NumElts) ||
6165       (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
6166        LHS.getConstantOperandAPInt(1) == NumElts))
6167     return Src;
6168 
6169   return SDValue();
6170 }
6171 
6172 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6173                                 const SDLoc &dl, unsigned vectorWidth) {
6174   EVT VT = Vec.getValueType();
6175   EVT ElVT = VT.getVectorElementType();
6176   unsigned Factor = VT.getSizeInBits() / vectorWidth;
6177   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6178                                   VT.getVectorNumElements() / Factor);
6179 
  // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR.
6181   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6182   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6183 
6184   // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 we just need to clear bits.
6186   IdxVal &= ~(ElemsPerChunk - 1);
6187 
6188   // If the input is a buildvector just emit a smaller one.
6189   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6190     return DAG.getBuildVector(ResultVT, dl,
6191                               Vec->ops().slice(IdxVal, ElemsPerChunk));
6192 
6193   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6194   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6195 }
6196 
6197 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
6198 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6199 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6200 /// instructions or a simple subregister reference. Idx is an index in the
6201 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
6202 /// lowering EXTRACT_VECTOR_ELT operations easier.
6203 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6204                                    SelectionDAG &DAG, const SDLoc &dl) {
6205   assert((Vec.getValueType().is256BitVector() ||
6206           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6207   return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6208 }
6209 
6210 /// Generate a DAG to grab 256-bits from a 512-bit vector.
6211 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6212                                    SelectionDAG &DAG, const SDLoc &dl) {
6213   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6214   return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6215 }
6216 
6217 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6218                                SelectionDAG &DAG, const SDLoc &dl,
6219                                unsigned vectorWidth) {
6220   assert((vectorWidth == 128 || vectorWidth == 256) &&
6221          "Unsupported vector width");
  // Inserting UNDEF just returns Result.
6223   if (Vec.isUndef())
6224     return Result;
6225   EVT VT = Vec.getValueType();
6226   EVT ElVT = VT.getVectorElementType();
6227   EVT ResultVT = Result.getValueType();
6228 
6229   // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6231   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6232 
6233   // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 we just need to clear bits.
6235   IdxVal &= ~(ElemsPerChunk - 1);
6236 
6237   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6238   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6239 }
6240 
6241 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
6242 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6243 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6244 /// simple superregister reference.  Idx is an index in the 128 bits
6245 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
6246 /// lowering INSERT_VECTOR_ELT operations easier.
6247 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6248                                   SelectionDAG &DAG, const SDLoc &dl) {
6249   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
6250   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6251 }
6252 
6253 /// Widen a vector to a larger size with the same scalar type, with the new
6254 /// elements either zero or undef.
6255 static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6256                               const X86Subtarget &Subtarget, SelectionDAG &DAG,
6257                               const SDLoc &dl) {
6258   assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&
6259          Vec.getValueType().getScalarType() == VT.getScalarType() &&
6260          "Unsupported vector widening type");
6261   SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6262                                 : DAG.getUNDEF(VT);
6263   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6264                      DAG.getIntPtrConstant(0, dl));
6265 }
6266 
6267 /// Widen a vector to a larger size with the same scalar type, with the new
6268 /// elements either zero or undef.
6269 static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6270                               const X86Subtarget &Subtarget, SelectionDAG &DAG,
6271                               const SDLoc &dl, unsigned WideSizeInBits) {
6272   assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6273          (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6274          "Unsupported vector widening type");
6275   unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6276   MVT SVT = Vec.getSimpleValueType().getScalarType();
6277   MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6278   return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6279 }
6280 
6281 // Helper function to collect subvector ops that are concatenated together,
6282 // either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
6283 // The subvectors in Ops are guaranteed to be the same type.
6284 static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
6285   assert(Ops.empty() && "Expected an empty ops vector");
6286 
6287   if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6288     Ops.append(N->op_begin(), N->op_end());
6289     return true;
6290   }
6291 
6292   if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6293     SDValue Src = N->getOperand(0);
6294     SDValue Sub = N->getOperand(1);
6295     const APInt &Idx = N->getConstantOperandAPInt(2);
6296     EVT VT = Src.getValueType();
6297     EVT SubVT = Sub.getValueType();
6298 
6299     // TODO - Handle more general insert_subvector chains.
6300     if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
6301         Idx == (VT.getVectorNumElements() / 2)) {
6302       // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6303       if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6304           Src.getOperand(1).getValueType() == SubVT &&
6305           isNullConstant(Src.getOperand(2))) {
6306         Ops.push_back(Src.getOperand(1));
6307         Ops.push_back(Sub);
6308         return true;
6309       }
6310       // insert_subvector(x, extract_subvector(x, lo), hi)
6311       if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6312           Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6313         Ops.append(2, Sub);
6314         return true;
6315       }
6316     }
6317   }
6318 
6319   return false;
6320 }
6321 
6322 static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6323                                                const SDLoc &dl) {
6324   EVT VT = Op.getValueType();
6325   unsigned NumElems = VT.getVectorNumElements();
6326   unsigned SizeInBits = VT.getSizeInBits();
6327   assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6328          "Can't split odd sized vector");
6329 
6330   // If this is a splat value (with no-undefs) then use the lower subvector,
6331   // which should be a free extraction.
6332   SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6333   if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
6334     return std::make_pair(Lo, Lo);
6335 
6336   SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6337   return std::make_pair(Lo, Hi);
6338 }
6339 
/// Break an operation into 2 half-sized ops and then concatenate the results.
6341 static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
6342   unsigned NumOps = Op.getNumOperands();
6343   EVT VT = Op.getValueType();
6344   SDLoc dl(Op);
6345 
  // Split each vector operand into Lo/Hi halves; scalar operands are reused.
6347   SmallVector<SDValue> LoOps(NumOps, SDValue());
6348   SmallVector<SDValue> HiOps(NumOps, SDValue());
6349   for (unsigned I = 0; I != NumOps; ++I) {
6350     SDValue SrcOp = Op.getOperand(I);
6351     if (!SrcOp.getValueType().isVector()) {
6352       LoOps[I] = HiOps[I] = SrcOp;
6353       continue;
6354     }
6355     std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
6356   }
6357 
6358   EVT LoVT, HiVT;
6359   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6360   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6361                      DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
6362                      DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
6363 }
6364 
/// Break a unary integer operation into 2 half-sized ops and then
6366 /// concatenate the result back.
6367 static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6368   // Make sure we only try to split 256/512-bit types to avoid creating
6369   // narrow vectors.
6370   EVT VT = Op.getValueType();
6371   (void)VT;
6372   assert((Op.getOperand(0).getValueType().is256BitVector() ||
6373           Op.getOperand(0).getValueType().is512BitVector()) &&
6374          (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6375   assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6376              VT.getVectorNumElements() &&
6377          "Unexpected VTs!");
6378   return splitVectorOp(Op, DAG);
6379 }
6380 
/// Break a binary integer operation into 2 half-sized ops and then
6382 /// concatenate the result back.
6383 static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6384   // Assert that all the types match.
6385   EVT VT = Op.getValueType();
6386   (void)VT;
6387   assert(Op.getOperand(0).getValueType() == VT &&
6388          Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6389   assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6390   return splitVectorOp(Op, DAG);
6391 }
6392 
// Helper for splitting operands of an operation to a legal target size and
// applying a function to each part.
6395 // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6396 // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6397 // deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6398 // The argument Builder is a function that will be applied on each split part:
// SDValue Builder(SelectionDAG &G, SDLoc, ArrayRef<SDValue>)
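//
// Illustrative use (names here are hypothetical, not from this file): split a
// wide integer multiply into legal-width pieces and let the builder emit each
// sub-op:
//   auto MulBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                        ArrayRef<SDValue> Ops) {
//     return DAG.getNode(ISD::MUL, DL, Ops[0].getValueType(), Ops);
//   };
//   SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, VT, {LHS, RHS},
//                                  MulBuilder);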
6400 template <typename F>
6401 SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6402                          const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6403                          F Builder, bool CheckBWI = true) {
6404   assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6405   unsigned NumSubs = 1;
6406   if ((CheckBWI && Subtarget.useBWIRegs()) ||
6407       (!CheckBWI && Subtarget.useAVX512Regs())) {
6408     if (VT.getSizeInBits() > 512) {
6409       NumSubs = VT.getSizeInBits() / 512;
6410       assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6411     }
6412   } else if (Subtarget.hasAVX2()) {
6413     if (VT.getSizeInBits() > 256) {
6414       NumSubs = VT.getSizeInBits() / 256;
6415       assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6416     }
6417   } else {
6418     if (VT.getSizeInBits() > 128) {
6419       NumSubs = VT.getSizeInBits() / 128;
6420       assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6421     }
6422   }
6423 
6424   if (NumSubs == 1)
6425     return Builder(DAG, DL, Ops);
6426 
6427   SmallVector<SDValue, 4> Subs;
6428   for (unsigned i = 0; i != NumSubs; ++i) {
6429     SmallVector<SDValue, 2> SubOps;
6430     for (SDValue Op : Ops) {
6431       EVT OpVT = Op.getValueType();
6432       unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6433       unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6434       SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6435     }
6436     Subs.push_back(Builder(DAG, DL, SubOps));
6437   }
6438   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6439 }
6440 
// Helper function that extends a non-512-bit vector op to 512 bits on non-VLX
// targets.
6443 static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
6444                              ArrayRef<SDValue> Ops, SelectionDAG &DAG,
6445                              const X86Subtarget &Subtarget) {
6446   assert(Subtarget.hasAVX512() && "AVX512 target expected");
6447   MVT SVT = VT.getScalarType();
6448 
  // If we have a 32/64-bit splatted constant, splat it to DstTy to
6450   // encourage a foldable broadcast'd operand.
6451   auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
6452     unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
6453     // AVX512 broadcasts 32/64-bit operands.
6454     // TODO: Support float once getAVX512Node is used by fp-ops.
6455     if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
6456         !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
6457       return SDValue();
6458     // If we're not widening, don't bother if we're not bitcasting.
6459     if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
6460       return SDValue();
6461     if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
6462       APInt SplatValue, SplatUndef;
6463       unsigned SplatBitSize;
6464       bool HasAnyUndefs;
6465       if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
6466                               HasAnyUndefs, OpEltSizeInBits) &&
6467           !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
6468         return DAG.getConstant(SplatValue, DL, DstVT);
6469     }
6470     return SDValue();
6471   };
6472 
6473   bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
6474 
6475   MVT DstVT = VT;
6476   if (Widen)
6477     DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
6478 
6479   // Canonicalize src operands.
6480   SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
6481   for (SDValue &Op : SrcOps) {
6482     MVT OpVT = Op.getSimpleValueType();
6483     // Just pass through scalar operands.
6484     if (!OpVT.isVector())
6485       continue;
6486     assert(OpVT == VT && "Vector type mismatch");
6487 
6488     if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
6489       Op = BroadcastOp;
6490       continue;
6491     }
6492 
6493     // Just widen the subvector by inserting into an undef wide vector.
6494     if (Widen)
6495       Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
6496   }
6497 
6498   SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
6499 
6500   // Perform the 512-bit op then extract the bottom subvector.
6501   if (Widen)
6502     Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
6503   return Res;
6504 }
6505 
/// Insert an i1 subvector into an i1 vector.
6507 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6508                                 const X86Subtarget &Subtarget) {
6509 
6510   SDLoc dl(Op);
6511   SDValue Vec = Op.getOperand(0);
6512   SDValue SubVec = Op.getOperand(1);
6513   SDValue Idx = Op.getOperand(2);
6514   unsigned IdxVal = Op.getConstantOperandVal(2);
6515 
6516   // Inserting undef is a nop. We can just return the original vector.
6517   if (SubVec.isUndef())
6518     return Vec;
6519 
6520   if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6521     return Op;
6522 
6523   MVT OpVT = Op.getSimpleValueType();
6524   unsigned NumElems = OpVT.getVectorNumElements();
6525   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6526 
6527   // Extend to natively supported kshift.
6528   MVT WideOpVT = OpVT;
6529   if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6530     WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6531 
6532   // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6533   // if necessary.
6534   if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6535     // May need to promote to a legal type.
6536     Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6537                      DAG.getConstant(0, dl, WideOpVT),
6538                      SubVec, Idx);
6539     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6540   }
6541 
6542   MVT SubVecVT = SubVec.getSimpleValueType();
6543   unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6544   assert(IdxVal + SubVecNumElems <= NumElems &&
6545          IdxVal % SubVecVT.getSizeInBits() == 0 &&
6546          "Unexpected index value in INSERT_SUBVECTOR");
6547 
6548   SDValue Undef = DAG.getUNDEF(WideOpVT);
6549 
6550   if (IdxVal == 0) {
6551     // Zero lower bits of the Vec
6552     SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6553     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6554                       ZeroIdx);
6555     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6556     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
    // Merge them together; SubVec should be zero extended.
6558     SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6559                          DAG.getConstant(0, dl, WideOpVT),
6560                          SubVec, ZeroIdx);
6561     Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6562     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6563   }
6564 
6565   SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6566                        Undef, SubVec, ZeroIdx);
6567 
6568   if (Vec.isUndef()) {
6569     assert(IdxVal != 0 && "Unexpected index");
6570     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6571                          DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6572     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6573   }
6574 
6575   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6576     assert(IdxVal != 0 && "Unexpected index");
6577     // If upper elements of Vec are known undef, then just shift into place.
6578     if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6579                      [](SDValue V) { return V.isUndef(); })) {
6580       SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6581                            DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6582     } else {
6583       NumElems = WideOpVT.getVectorNumElements();
6584       unsigned ShiftLeft = NumElems - SubVecNumElems;
6585       unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6586       SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6587                            DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6588       if (ShiftRight != 0)
6589         SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6590                              DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6591     }
6592     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6593   }
6594 
  // Simple case when we put the subvector in the upper part
6596   if (IdxVal + SubVecNumElems == NumElems) {
6597     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6598                          DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6599     if (SubVecNumElems * 2 == NumElems) {
6600       // Special case, use legal zero extending insert_subvector. This allows
6601       // isel to optimize when bits are known zero.
6602       Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6603       Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6604                         DAG.getConstant(0, dl, WideOpVT),
6605                         Vec, ZeroIdx);
6606     } else {
6607       // Otherwise use explicit shifts to zero the bits.
6608       Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6609                         Undef, Vec, ZeroIdx);
6610       NumElems = WideOpVT.getVectorNumElements();
6611       SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
6612       Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6613       Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6614     }
6615     Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6616     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6617   }
6618 
6619   // Inserting into the middle is more complicated.
6620 
6621   NumElems = WideOpVT.getVectorNumElements();
6622 
6623   // Widen the vector if needed.
6624   Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
6625 
6626   unsigned ShiftLeft = NumElems - SubVecNumElems;
6627   unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6628 
  // Do an optimization for the most frequently used types.
6630   if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
6631     APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
6632     Mask0.flipAllBits();
6633     SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
6634     SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
6635     Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
6636     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6637                          DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6638     SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6639                          DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6640     Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6641 
6642     // Reduce to original width if needed.
6643     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6644   }
6645 
6646   // Clear the upper bits of the subvector and move it to its insert position.
6647   SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6648                        DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6649   SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6650                        DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6651 
6652   // Isolate the bits below the insertion point.
6653   unsigned LowShift = NumElems - IdxVal;
6654   SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
6655                             DAG.getTargetConstant(LowShift, dl, MVT::i8));
6656   Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
6657                     DAG.getTargetConstant(LowShift, dl, MVT::i8));
6658 
6659   // Isolate the bits after the last inserted bit.
6660   unsigned HighShift = IdxVal + SubVecNumElems;
  SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
                             DAG.getTargetConstant(HighShift, dl, MVT::i8));
  High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
                     DAG.getTargetConstant(HighShift, dl, MVT::i8));
6665 
6666   // Now OR all 3 pieces together.
6667   Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
6668   SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
6669 
6670   // Reduce to original width if needed.
6671   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6672 }
6673 
6674 static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
6675                                 const SDLoc &dl) {
6676   assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
6677   EVT SubVT = V1.getValueType();
6678   EVT SubSVT = SubVT.getScalarType();
6679   unsigned SubNumElts = SubVT.getVectorNumElements();
6680   unsigned SubVectorWidth = SubVT.getSizeInBits();
6681   EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
6682   SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
6683   return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
6684 }
6685 
6686 /// Returns a vector of specified type with all bits set.
6687 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
6688 /// Then bitcast to their original type, ensuring they get CSE'd.
6689 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6690   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6691          "Expected a 128/256/512-bit vector type");
6692 
6693   APInt Ones = APInt::getAllOnes(32);
6694   unsigned NumElts = VT.getSizeInBits() / 32;
6695   SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
6696   return DAG.getBitcast(VT, Vec);
6697 }
6698 
// Convert an *_EXTEND or *_EXTEND_VECTOR_INREG opcode to the *_EXTEND form.
6700 static unsigned getOpcode_EXTEND(unsigned Opcode) {
6701   switch (Opcode) {
6702   case ISD::ANY_EXTEND:
6703   case ISD::ANY_EXTEND_VECTOR_INREG:
6704     return ISD::ANY_EXTEND;
6705   case ISD::ZERO_EXTEND:
6706   case ISD::ZERO_EXTEND_VECTOR_INREG:
6707     return ISD::ZERO_EXTEND;
6708   case ISD::SIGN_EXTEND:
6709   case ISD::SIGN_EXTEND_VECTOR_INREG:
6710     return ISD::SIGN_EXTEND;
6711   }
6712   llvm_unreachable("Unknown opcode");
6713 }
6714 
// Convert an *_EXTEND or *_EXTEND_VECTOR_INREG opcode to the
// *_EXTEND_VECTOR_INREG form.
6716 static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
6717   switch (Opcode) {
6718   case ISD::ANY_EXTEND:
6719   case ISD::ANY_EXTEND_VECTOR_INREG:
6720     return ISD::ANY_EXTEND_VECTOR_INREG;
6721   case ISD::ZERO_EXTEND:
6722   case ISD::ZERO_EXTEND_VECTOR_INREG:
6723     return ISD::ZERO_EXTEND_VECTOR_INREG;
6724   case ISD::SIGN_EXTEND:
6725   case ISD::SIGN_EXTEND_VECTOR_INREG:
6726     return ISD::SIGN_EXTEND_VECTOR_INREG;
6727   }
6728   llvm_unreachable("Unknown opcode");
6729 }
6730 
6731 static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
6732                                       SDValue In, SelectionDAG &DAG) {
6733   EVT InVT = In.getValueType();
6734   assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
6735   assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
6736           ISD::ZERO_EXTEND == Opcode) &&
6737          "Unknown extension opcode");
6738 
6739   // For 256-bit vectors, we only need the lower (128-bit) input half.
6740   // For 512-bit vectors, we only need the lower input half or quarter.
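  // e.g. for a 256-bit v16i16 -> v8i32 extension we extract the low v8i16 half
  // and emit a plain *_EXTEND; if the element counts still differ after any
  // extraction, the *_EXTEND_VECTOR_INREG form is used instead.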
6741   if (InVT.getSizeInBits() > 128) {
6742     assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
6743            "Expected VTs to be the same size!");
6744     unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
6745     In = extractSubVector(In, 0, DAG, DL,
6746                           std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
6747     InVT = In.getValueType();
6748   }
6749 
6750   if (VT.getVectorNumElements() != InVT.getVectorNumElements())
6751     Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
6752 
6753   return DAG.getNode(Opcode, DL, VT, In);
6754 }
6755 
6756 // Match (xor X, -1) -> X.
6757 // Match extract_subvector(xor X, -1) -> extract_subvector(X).
6758 // Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
6759 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
6760   V = peekThroughBitcasts(V);
6761   if (V.getOpcode() == ISD::XOR &&
6762       ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
6763     return V.getOperand(0);
6764   if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6765       (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
6766     if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
6767       Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
6768       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
6769                          Not, V.getOperand(1));
6770     }
6771   }
6772   SmallVector<SDValue, 2> CatOps;
6773   if (collectConcatOps(V.getNode(), CatOps)) {
6774     for (SDValue &CatOp : CatOps) {
6775       SDValue NotCat = IsNOT(CatOp, DAG);
6776       if (!NotCat) return SDValue();
6777       CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
6778     }
6779     return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
6780   }
6781   return SDValue();
6782 }
6783 
6784 void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
6785                                    bool Lo, bool Unary) {
6786   assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
6787          "Illegal vector type to unpack");
6788   assert(Mask.empty() && "Expected an empty shuffle mask vector");
6789   int NumElts = VT.getVectorNumElements();
6790   int NumEltsInLane = 128 / VT.getScalarSizeInBits();
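  // e.g. for v4i32: Lo -> <0,4,1,5>, Hi -> <2,6,3,7>; the unary forms reuse
  // operand 0, giving <0,0,1,1> and <2,2,3,3>.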
6791   for (int i = 0; i < NumElts; ++i) {
6792     unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
6793     int Pos = (i % NumEltsInLane) / 2 + LaneStart;
6794     Pos += (Unary ? 0 : NumElts * (i % 2));
6795     Pos += (Lo ? 0 : NumEltsInLane / 2);
6796     Mask.push_back(Pos);
6797   }
6798 }
6799 
6800 /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
6801 /// imposed by AVX and specific to the unary pattern. Example:
6802 /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
6803 /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
6804 void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6805                                    bool Lo) {
6806   assert(Mask.empty() && "Expected an empty shuffle mask vector");
6807   int NumElts = VT.getVectorNumElements();
6808   for (int i = 0; i < NumElts; ++i) {
6809     int Pos = i / 2;
6810     Pos += (Lo ? 0 : NumElts / 2);
6811     Mask.push_back(Pos);
6812   }
6813 }
6814 
6815 // Attempt to constant fold, else just create a VECTOR_SHUFFLE.
6816 static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
6817                                 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
6818   if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
6819       (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
6820     SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
6821     for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
6822       int M = Mask[I];
6823       if (M < 0)
6824         continue;
6825       SDValue V = (M < NumElts) ? V1 : V2;
6826       if (V.isUndef())
6827         continue;
6828       Ops[I] = V.getOperand(M % NumElts);
6829     }
6830     return DAG.getBuildVector(VT, dl, Ops);
6831   }
6832 
6833   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6834 }
6835 
6836 /// Returns a vector_shuffle node for an unpackl operation.
6837 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6838                           SDValue V1, SDValue V2) {
6839   SmallVector<int, 8> Mask;
6840   createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
6841   return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
6842 }
6843 
6844 /// Returns a vector_shuffle node for an unpackh operation.
6845 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6846                           SDValue V1, SDValue V2) {
6847   SmallVector<int, 8> Mask;
6848   createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
6849   return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
6850 }
6851 
6852 /// Returns a node that packs the LHS + RHS nodes together at half width.
6853 /// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
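/// e.g. packing two v4i32 operands yields a v8i16 result holding the low
/// (PackHiHalf=false) or high (PackHiHalf=true) 16 bits of each input element.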
6854 /// TODO: Add subvector splitting if/when we have a need for it.
6855 static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6856                        const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
6857                        bool PackHiHalf = false) {
6858   MVT OpVT = LHS.getSimpleValueType();
6859   unsigned EltSizeInBits = VT.getScalarSizeInBits();
6860   bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
6861   assert(OpVT == RHS.getSimpleValueType() &&
6862          VT.getSizeInBits() == OpVT.getSizeInBits() &&
6863          (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
6864          "Unexpected PACK operand types");
6865   assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
6866          "Unexpected PACK result type");
6867 
6868   // Rely on vector shuffles for vXi64 -> vXi32 packing.
6869   if (EltSizeInBits == 32) {
6870     SmallVector<int> PackMask;
6871     int Offset = PackHiHalf ? 1 : 0;
6872     int NumElts = VT.getVectorNumElements();
6873     for (int I = 0; I != NumElts; I += 4) {
6874       PackMask.push_back(I + Offset);
6875       PackMask.push_back(I + Offset + 2);
6876       PackMask.push_back(I + Offset + NumElts);
6877       PackMask.push_back(I + Offset + NumElts + 2);
6878     }
6879     return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
6880                                 DAG.getBitcast(VT, RHS), PackMask);
6881   }
6882 
6883   // See if we already have sufficient leading bits for PACKSS/PACKUS.
6884   if (!PackHiHalf) {
6885     if (UsePackUS &&
6886         DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
6887         DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
6888       return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
6889 
6890     if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
6891         DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
6892       return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
6893   }
6894 
6895   // Fallback to sign/zero extending the requested half and pack.
6896   SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
6897   if (UsePackUS) {
6898     if (PackHiHalf) {
6899       LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
6900       RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
6901     } else {
6902       SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
6903       LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
6904       RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
6905     };
6906     return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
6907   };
6908 
6909   if (!PackHiHalf) {
6910     LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
6911     RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
6912   }
6913   LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
6914   RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
6915   return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
6916 }
6917 
/// Return a vector_shuffle of the specified vector V2 and a zero or undef
/// vector. This produces a shuffle where the low element of V2 is swizzled
/// into the zero/undef vector, landing at element Idx.
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
6922 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
6923                                            bool IsZero,
6924                                            const X86Subtarget &Subtarget,
6925                                            SelectionDAG &DAG) {
6926   MVT VT = V2.getSimpleValueType();
6927   SDValue V1 = IsZero
6928     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
6929   int NumElems = VT.getVectorNumElements();
6930   SmallVector<int, 16> MaskVec(NumElems);
6931   for (int i = 0; i != NumElems; ++i)
6932     // If this is the insertion idx, put the low elt of V2 here.
6933     MaskVec[i] = (i == Idx) ? NumElems : i;
6934   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
6935 }
6936 
6937 static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
6938   if (Ptr.getOpcode() == X86ISD::Wrapper ||
6939       Ptr.getOpcode() == X86ISD::WrapperRIP)
6940     Ptr = Ptr.getOperand(0);
6941 
6942   auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6943   if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
6944     return nullptr;
6945 
6946   return CNode->getConstVal();
6947 }
6948 
6949 static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
6950   if (!Load || !ISD::isNormalLoad(Load))
6951     return nullptr;
6952   return getTargetConstantFromBasePtr(Load->getBasePtr());
6953 }
6954 
6955 static const Constant *getTargetConstantFromNode(SDValue Op) {
6956   Op = peekThroughBitcasts(Op);
6957   return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
6958 }
6959 
6960 const Constant *
6961 X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
6962   assert(LD && "Unexpected null LoadSDNode");
6963   return getTargetConstantFromNode(LD);
6964 }
6965 
6966 // Extract raw constant bits from constant pools.
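// e.g. a v4i32 constant <0,1,2,3> queried with EltSizeInBits=64 yields the two
// 64-bit elements 0x0000000100000000 and 0x0000000300000002.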
6967 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
6968                                           APInt &UndefElts,
6969                                           SmallVectorImpl<APInt> &EltBits,
6970                                           bool AllowWholeUndefs = true,
6971                                           bool AllowPartialUndefs = true) {
6972   assert(EltBits.empty() && "Expected an empty EltBits vector");
6973 
6974   Op = peekThroughBitcasts(Op);
6975 
6976   EVT VT = Op.getValueType();
6977   unsigned SizeInBits = VT.getSizeInBits();
6978   assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
6979   unsigned NumElts = SizeInBits / EltSizeInBits;
6980 
6981   // Bitcast a source array of element bits to the target size.
6982   auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
6983     unsigned NumSrcElts = UndefSrcElts.getBitWidth();
6984     unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
6985     assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
6986            "Constant bit sizes don't match");
6987 
6988     // Don't split if we don't allow undef bits.
6989     bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
6990     if (UndefSrcElts.getBoolValue() && !AllowUndefs)
6991       return false;
6992 
6993     // If we're already the right size, don't bother bitcasting.
6994     if (NumSrcElts == NumElts) {
6995       UndefElts = UndefSrcElts;
6996       EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
6997       return true;
6998     }
6999 
7000     // Extract all the undef/constant element data and pack into single bitsets.
7001     APInt UndefBits(SizeInBits, 0);
7002     APInt MaskBits(SizeInBits, 0);
7003 
7004     for (unsigned i = 0; i != NumSrcElts; ++i) {
7005       unsigned BitOffset = i * SrcEltSizeInBits;
7006       if (UndefSrcElts[i])
7007         UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
7008       MaskBits.insertBits(SrcEltBits[i], BitOffset);
7009     }
7010 
7011     // Split the undef/constant single bitset data into the target elements.
7012     UndefElts = APInt(NumElts, 0);
7013     EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
7014 
7015     for (unsigned i = 0; i != NumElts; ++i) {
7016       unsigned BitOffset = i * EltSizeInBits;
7017       APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
7018 
7019       // Only treat an element as UNDEF if all bits are UNDEF.
7020       if (UndefEltBits.isAllOnes()) {
7021         if (!AllowWholeUndefs)
7022           return false;
7023         UndefElts.setBit(i);
7024         continue;
7025       }
7026 
7027       // If only some bits are UNDEF then treat them as zero (or bail if not
7028       // supported).
7029       if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
7030         return false;
7031 
7032       EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
7033     }
7034     return true;
7035   };
7036 
7037   // Collect constant bits and insert into mask/undef bit masks.
7038   auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
7039                                 unsigned UndefBitIndex) {
7040     if (!Cst)
7041       return false;
7042     if (isa<UndefValue>(Cst)) {
7043       Undefs.setBit(UndefBitIndex);
7044       return true;
7045     }
7046     if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
7047       Mask = CInt->getValue();
7048       return true;
7049     }
7050     if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
7051       Mask = CFP->getValueAPF().bitcastToAPInt();
7052       return true;
7053     }
7054     return false;
7055   };
7056 
7057   // Handle UNDEFs.
7058   if (Op.isUndef()) {
7059     APInt UndefSrcElts = APInt::getAllOnes(NumElts);
7060     SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
7061     return CastBitData(UndefSrcElts, SrcEltBits);
7062   }
7063 
7064   // Extract scalar constant bits.
7065   if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
7066     APInt UndefSrcElts = APInt::getZero(1);
7067     SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
7068     return CastBitData(UndefSrcElts, SrcEltBits);
7069   }
7070   if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7071     APInt UndefSrcElts = APInt::getZero(1);
7072     APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
7073     SmallVector<APInt, 64> SrcEltBits(1, RawBits);
7074     return CastBitData(UndefSrcElts, SrcEltBits);
7075   }
7076 
7077   // Extract constant bits from build vector.
7078   if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7079     BitVector Undefs;
7080     SmallVector<APInt> SrcEltBits;
7081     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7082     if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
      APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
7084       for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
7085         if (Undefs[I])
7086           UndefSrcElts.setBit(I);
7087       return CastBitData(UndefSrcElts, SrcEltBits);
7088     }
7089   }
7090 
7091   // Extract constant bits from constant pool vector.
7092   if (auto *Cst = getTargetConstantFromNode(Op)) {
7093     Type *CstTy = Cst->getType();
7094     unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7095     if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
7096       return false;
7097 
7098     unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
7099     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7100 
7101     APInt UndefSrcElts(NumSrcElts, 0);
7102     SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
7103     for (unsigned i = 0; i != NumSrcElts; ++i)
7104       if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
7105                                UndefSrcElts, i))
7106         return false;
7107 
7108     return CastBitData(UndefSrcElts, SrcEltBits);
7109   }
7110 
7111   // Extract constant bits from a broadcasted constant pool scalar.
7112   if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
7113       EltSizeInBits <= VT.getScalarSizeInBits()) {
7114     auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7115     if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
7116       return false;
7117 
7118     SDValue Ptr = MemIntr->getBasePtr();
7119     if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
7120       unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
7121       unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7122 
7123       APInt UndefSrcElts(NumSrcElts, 0);
7124       SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
7125       if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
7126         if (UndefSrcElts[0])
7127           UndefSrcElts.setBits(0, NumSrcElts);
7128         SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
7129         return CastBitData(UndefSrcElts, SrcEltBits);
7130       }
7131     }
7132   }
7133 
7134   // Extract constant bits from a subvector broadcast.
7135   if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
7136     auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7137     SDValue Ptr = MemIntr->getBasePtr();
    // The source constant may be larger than the subvector broadcast, so
    // ensure we extract the correct subvector constants.
7140     if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
7141       Type *CstTy = Cst->getType();
7142       unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7143       unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
7144       if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
7145           (SizeInBits % SubVecSizeInBits) != 0)
7146         return false;
7147       unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
7148       unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
7149       unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
7150       APInt UndefSubElts(NumSubElts, 0);
7151       SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
7152                                         APInt(CstEltSizeInBits, 0));
7153       for (unsigned i = 0; i != NumSubElts; ++i) {
7154         if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
7155                                  UndefSubElts, i))
7156           return false;
7157         for (unsigned j = 1; j != NumSubVecs; ++j)
7158           SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
7159       }
7160       UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
7161                                      UndefSubElts);
7162       return CastBitData(UndefSubElts, SubEltBits);
7163     }
7164   }
7165 
7166   // Extract a rematerialized scalar constant insertion.
7167   if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
7168       Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7169       isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
7170     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7171     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7172 
7173     APInt UndefSrcElts(NumSrcElts, 0);
7174     SmallVector<APInt, 64> SrcEltBits;
7175     auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
7176     SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
7177     SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
7178     return CastBitData(UndefSrcElts, SrcEltBits);
7179   }
7180 
  // Insert constant bits from base and subvector sources.
7182   if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
    // If we bitcast to larger elements we might lose track of undefs, so to
    // be safe don't allow any.
7185     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7186     bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
7187 
7188     APInt UndefSrcElts, UndefSubElts;
7189     SmallVector<APInt, 32> EltSrcBits, EltSubBits;
7190     if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
7191                                       UndefSubElts, EltSubBits,
7192                                       AllowWholeUndefs && AllowUndefs,
7193                                       AllowPartialUndefs && AllowUndefs) &&
7194         getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
7195                                       UndefSrcElts, EltSrcBits,
7196                                       AllowWholeUndefs && AllowUndefs,
7197                                       AllowPartialUndefs && AllowUndefs)) {
7198       unsigned BaseIdx = Op.getConstantOperandVal(2);
7199       UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
7200       for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
7201         EltSrcBits[BaseIdx + i] = EltSubBits[i];
7202       return CastBitData(UndefSrcElts, EltSrcBits);
7203     }
7204   }
7205 
7206   // Extract constant bits from a subvector's source.
7207   if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7208     // TODO - support extract_subvector through bitcasts.
7209     if (EltSizeInBits != VT.getScalarSizeInBits())
7210       return false;
7211 
7212     if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7213                                       UndefElts, EltBits, AllowWholeUndefs,
7214                                       AllowPartialUndefs)) {
7215       EVT SrcVT = Op.getOperand(0).getValueType();
7216       unsigned NumSrcElts = SrcVT.getVectorNumElements();
7217       unsigned NumSubElts = VT.getVectorNumElements();
7218       unsigned BaseIdx = Op.getConstantOperandVal(1);
7219       UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7220       if ((BaseIdx + NumSubElts) != NumSrcElts)
7221         EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7222       if (BaseIdx != 0)
7223         EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7224       return true;
7225     }
7226   }
7227 
7228   // Extract constant bits from shuffle node sources.
7229   if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7230     // TODO - support shuffle through bitcasts.
7231     if (EltSizeInBits != VT.getScalarSizeInBits())
7232       return false;
7233 
7234     ArrayRef<int> Mask = SVN->getMask();
7235     if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7236         llvm::any_of(Mask, [](int M) { return M < 0; }))
7237       return false;
7238 
7239     APInt UndefElts0, UndefElts1;
7240     SmallVector<APInt, 32> EltBits0, EltBits1;
7241     if (isAnyInRange(Mask, 0, NumElts) &&
7242         !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7243                                        UndefElts0, EltBits0, AllowWholeUndefs,
7244                                        AllowPartialUndefs))
7245       return false;
7246     if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7247         !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7248                                        UndefElts1, EltBits1, AllowWholeUndefs,
7249                                        AllowPartialUndefs))
7250       return false;
7251 
7252     UndefElts = APInt::getZero(NumElts);
7253     for (int i = 0; i != (int)NumElts; ++i) {
7254       int M = Mask[i];
7255       if (M < 0) {
7256         UndefElts.setBit(i);
7257         EltBits.push_back(APInt::getZero(EltSizeInBits));
7258       } else if (M < (int)NumElts) {
7259         if (UndefElts0[M])
7260           UndefElts.setBit(i);
7261         EltBits.push_back(EltBits0[M]);
7262       } else {
7263         if (UndefElts1[M - NumElts])
7264           UndefElts.setBit(i);
7265         EltBits.push_back(EltBits1[M - NumElts]);
7266       }
7267     }
7268     return true;
7269   }
7270 
7271   return false;
7272 }
7273 
7274 namespace llvm {
7275 namespace X86 {
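// Match a constant (or partially undef constant) vector where every defined
// element has the same value, returning that value in SplatVal. Undef
// elements are skipped; a completely undef vector does not match.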
7276 bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7277   APInt UndefElts;
7278   SmallVector<APInt, 16> EltBits;
7279   if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7280                                     UndefElts, EltBits, true,
7281                                     AllowPartialUndefs)) {
7282     int SplatIndex = -1;
7283     for (int i = 0, e = EltBits.size(); i != e; ++i) {
7284       if (UndefElts[i])
7285         continue;
7286       if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7287         SplatIndex = -1;
7288         break;
7289       }
7290       SplatIndex = i;
7291     }
7292     if (0 <= SplatIndex) {
7293       SplatVal = EltBits[SplatIndex];
7294       return true;
7295     }
7296   }
7297 
7298   return false;
7299 }
7300 } // namespace X86
7301 } // namespace llvm
7302 
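// Extract the raw constant shuffle mask indices from a shuffle control vector
// node. Whole-undef mask elements are allowed (and reported via UndefElts),
// but any partially undef element causes the match to fail.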
7303 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7304                                         unsigned MaskEltSizeInBits,
7305                                         SmallVectorImpl<uint64_t> &RawMask,
7306                                         APInt &UndefElts) {
7307   // Extract the raw target constant bits.
7308   SmallVector<APInt, 64> EltBits;
7309   if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7310                                      EltBits, /* AllowWholeUndefs */ true,
7311                                      /* AllowPartialUndefs */ false))
7312     return false;
7313 
7314   // Insert the extracted elements into the mask.
7315   for (const APInt &Elt : EltBits)
7316     RawMask.push_back(Elt.getZExtValue());
7317 
7318   return true;
7319 }
7320 
7321 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7322 /// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7323 /// Note: This ignores saturation, so inputs must be checked first.
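/// e.g. A single-stage v8i16 PACKSSDW(LHS,RHS) produces the truncation mask
/// <0,2,4,6,8,10,12,14>, selecting the low i16 of each i32 from the
/// concatenated (bitcast) inputs.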
7324 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7325                                   bool Unary, unsigned NumStages = 1) {
7326   assert(Mask.empty() && "Expected an empty shuffle mask vector");
7327   unsigned NumElts = VT.getVectorNumElements();
7328   unsigned NumLanes = VT.getSizeInBits() / 128;
7329   unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7330   unsigned Offset = Unary ? 0 : NumElts;
7331   unsigned Repetitions = 1u << (NumStages - 1);
7332   unsigned Increment = 1u << NumStages;
7333   assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7334 
7335   for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7336     for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7337       for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7338         Mask.push_back(Elt + (Lane * NumEltsPerLane));
7339       for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7340         Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7341     }
7342   }
7343 }
7344 
7345 // Split the demanded elts of a PACKSS/PACKUS node between its operands.
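// Within each 128-bit lane of the result, the lower half of the elements are
// packed from the LHS operand and the upper half from the RHS operand.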
7346 static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7347                                 APInt &DemandedLHS, APInt &DemandedRHS) {
7348   int NumLanes = VT.getSizeInBits() / 128;
7349   int NumElts = DemandedElts.getBitWidth();
7350   int NumInnerElts = NumElts / 2;
7351   int NumEltsPerLane = NumElts / NumLanes;
7352   int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7353 
7354   DemandedLHS = APInt::getZero(NumInnerElts);
7355   DemandedRHS = APInt::getZero(NumInnerElts);
7356 
7357   // Map DemandedElts to the packed operands.
7358   for (int Lane = 0; Lane != NumLanes; ++Lane) {
7359     for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7360       int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7361       int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7362       if (DemandedElts[OuterIdx])
7363         DemandedLHS.setBit(InnerIdx);
7364       if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7365         DemandedRHS.setBit(InnerIdx);
7366     }
7367   }
7368 }
7369 
7370 // Split the demanded elts of a HADD/HSUB node between its operands.
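// Within each 128-bit lane, result element I in the lower half of the lane
// depends on LHS source elements (2*I, 2*I+1), and element I in the upper
// half depends on the corresponding RHS element pair.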
7371 static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7372                                  APInt &DemandedLHS, APInt &DemandedRHS) {
7373   int NumLanes = VT.getSizeInBits() / 128;
7374   int NumElts = DemandedElts.getBitWidth();
7375   int NumEltsPerLane = NumElts / NumLanes;
7376   int HalfEltsPerLane = NumEltsPerLane / 2;
7377 
7378   DemandedLHS = APInt::getZero(NumElts);
7379   DemandedRHS = APInt::getZero(NumElts);
7380 
7381   // Map DemandedElts to the horizontal operands.
7382   for (int Idx = 0; Idx != NumElts; ++Idx) {
7383     if (!DemandedElts[Idx])
7384       continue;
7385     int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7386     int LocalIdx = Idx % NumEltsPerLane;
7387     if (LocalIdx < HalfEltsPerLane) {
7388       DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7389       DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7390     } else {
7391       LocalIdx -= HalfEltsPerLane;
7392       DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7393       DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7394     }
7395   }
7396 }
7397 
7398 /// Calculates the shuffle mask corresponding to the target-specific opcode.
7399 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7400 /// operands in \p Ops, and returns true.
7401 /// Sets \p IsUnary to true if only one source is used. Note that this will set
7402 /// IsUnary for shuffles which use a single input multiple times, and in those
7403 /// cases it will adjust the mask to only have indices within that single input.
7404 /// It is an error to call this with non-empty Mask/Ops vectors.
7405 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7406                                  SmallVectorImpl<SDValue> &Ops,
7407                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
7408   unsigned NumElems = VT.getVectorNumElements();
7409   unsigned MaskEltSize = VT.getScalarSizeInBits();
7410   SmallVector<uint64_t, 32> RawMask;
7411   APInt RawUndefs;
7412   uint64_t ImmN;
7413 
7414   assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7415   assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7416 
7417   IsUnary = false;
7418   bool IsFakeUnary = false;
7419   switch (N->getOpcode()) {
7420   case X86ISD::BLENDI:
7421     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7422     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7423     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7424     DecodeBLENDMask(NumElems, ImmN, Mask);
7425     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7426     break;
7427   case X86ISD::SHUFP:
7428     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7429     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7430     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7431     DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7432     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7433     break;
7434   case X86ISD::INSERTPS:
7435     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7436     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7437     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7438     DecodeINSERTPSMask(ImmN, Mask);
7439     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7440     break;
7441   case X86ISD::EXTRQI:
7442     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7443     if (isa<ConstantSDNode>(N->getOperand(1)) &&
7444         isa<ConstantSDNode>(N->getOperand(2))) {
7445       int BitLen = N->getConstantOperandVal(1);
7446       int BitIdx = N->getConstantOperandVal(2);
7447       DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7448       IsUnary = true;
7449     }
7450     break;
7451   case X86ISD::INSERTQI:
7452     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7453     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7454     if (isa<ConstantSDNode>(N->getOperand(2)) &&
7455         isa<ConstantSDNode>(N->getOperand(3))) {
7456       int BitLen = N->getConstantOperandVal(2);
7457       int BitIdx = N->getConstantOperandVal(3);
7458       DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7459       IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7460     }
7461     break;
7462   case X86ISD::UNPCKH:
7463     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7464     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7465     DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7466     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7467     break;
7468   case X86ISD::UNPCKL:
7469     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7470     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7471     DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7472     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7473     break;
7474   case X86ISD::MOVHLPS:
7475     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7476     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7477     DecodeMOVHLPSMask(NumElems, Mask);
7478     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7479     break;
7480   case X86ISD::MOVLHPS:
7481     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7482     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7483     DecodeMOVLHPSMask(NumElems, Mask);
7484     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7485     break;
7486   case X86ISD::VALIGN:
7487     assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7488            "Only 32-bit and 64-bit elements are supported!");
7489     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7490     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7491     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7492     DecodeVALIGNMask(NumElems, ImmN, Mask);
7493     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7494     Ops.push_back(N->getOperand(1));
7495     Ops.push_back(N->getOperand(0));
7496     break;
7497   case X86ISD::PALIGNR:
7498     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7499     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7500     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7501     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7502     DecodePALIGNRMask(NumElems, ImmN, Mask);
7503     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7504     Ops.push_back(N->getOperand(1));
7505     Ops.push_back(N->getOperand(0));
7506     break;
7507   case X86ISD::VSHLDQ:
7508     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7509     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7510     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7511     DecodePSLLDQMask(NumElems, ImmN, Mask);
7512     IsUnary = true;
7513     break;
7514   case X86ISD::VSRLDQ:
7515     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7516     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7517     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7518     DecodePSRLDQMask(NumElems, ImmN, Mask);
7519     IsUnary = true;
7520     break;
7521   case X86ISD::PSHUFD:
7522   case X86ISD::VPERMILPI:
7523     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7524     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7525     DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7526     IsUnary = true;
7527     break;
7528   case X86ISD::PSHUFHW:
7529     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7530     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7531     DecodePSHUFHWMask(NumElems, ImmN, Mask);
7532     IsUnary = true;
7533     break;
7534   case X86ISD::PSHUFLW:
7535     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7536     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7537     DecodePSHUFLWMask(NumElems, ImmN, Mask);
7538     IsUnary = true;
7539     break;
7540   case X86ISD::VZEXT_MOVL:
7541     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7542     DecodeZeroMoveLowMask(NumElems, Mask);
7543     IsUnary = true;
7544     break;
7545   case X86ISD::VBROADCAST:
7546     // We only decode broadcasts of same-sized vectors; peeking through to
7547     // extracted subvectors is likely to cause hasOneUse issues with
7548     // SimplifyDemandedBits etc.
7549     if (N->getOperand(0).getValueType() == VT) {
7550       DecodeVectorBroadcast(NumElems, Mask);
7551       IsUnary = true;
7552       break;
7553     }
7554     return false;
7555   case X86ISD::VPERMILPV: {
7556     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7557     IsUnary = true;
7558     SDValue MaskNode = N->getOperand(1);
7559     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7560                                     RawUndefs)) {
7561       DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7562       break;
7563     }
7564     return false;
7565   }
7566   case X86ISD::PSHUFB: {
7567     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7568     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7569     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7570     IsUnary = true;
7571     SDValue MaskNode = N->getOperand(1);
7572     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7573       DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7574       break;
7575     }
7576     return false;
7577   }
7578   case X86ISD::VPERMI:
7579     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7580     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7581     DecodeVPERMMask(NumElems, ImmN, Mask);
7582     IsUnary = true;
7583     break;
7584   case X86ISD::MOVSS:
7585   case X86ISD::MOVSD:
7586   case X86ISD::MOVSH:
7587     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7588     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7589     DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7590     break;
7591   case X86ISD::VPERM2X128:
7592     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7593     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7594     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7595     DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7596     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7597     break;
7598   case X86ISD::SHUF128:
7599     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7600     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7601     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7602     decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7603     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7604     break;
7605   case X86ISD::MOVSLDUP:
7606     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7607     DecodeMOVSLDUPMask(NumElems, Mask);
7608     IsUnary = true;
7609     break;
7610   case X86ISD::MOVSHDUP:
7611     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7612     DecodeMOVSHDUPMask(NumElems, Mask);
7613     IsUnary = true;
7614     break;
7615   case X86ISD::MOVDDUP:
7616     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7617     DecodeMOVDDUPMask(NumElems, Mask);
7618     IsUnary = true;
7619     break;
7620   case X86ISD::VPERMIL2: {
7621     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7622     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7623     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7624     SDValue MaskNode = N->getOperand(2);
7625     SDValue CtrlNode = N->getOperand(3);
7626     if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7627       unsigned CtrlImm = CtrlOp->getZExtValue();
7628       if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7629                                       RawUndefs)) {
7630         DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7631                             Mask);
7632         break;
7633       }
7634     }
7635     return false;
7636   }
7637   case X86ISD::VPPERM: {
7638     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7639     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7640     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7641     SDValue MaskNode = N->getOperand(2);
7642     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7643       DecodeVPPERMMask(RawMask, RawUndefs, Mask);
7644       break;
7645     }
7646     return false;
7647   }
7648   case X86ISD::VPERMV: {
7649     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7650     IsUnary = true;
7651     // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
7652     Ops.push_back(N->getOperand(1));
7653     SDValue MaskNode = N->getOperand(0);
7654     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7655                                     RawUndefs)) {
7656       DecodeVPERMVMask(RawMask, RawUndefs, Mask);
7657       break;
7658     }
7659     return false;
7660   }
7661   case X86ISD::VPERMV3: {
7662     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7663     assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
7664     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
7665     // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
7666     Ops.push_back(N->getOperand(0));
7667     Ops.push_back(N->getOperand(2));
7668     SDValue MaskNode = N->getOperand(1);
7669     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7670                                     RawUndefs)) {
7671       DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
7672       break;
7673     }
7674     return false;
7675   }
7676   default: llvm_unreachable("unknown target shuffle node");
7677   }
7678 
7679   // Empty mask indicates the decode failed.
7680   if (Mask.empty())
7681     return false;
7682 
7683   // Check if we're getting a shuffle mask with zeroed elements.
7684   if (!AllowSentinelZero && isAnyZero(Mask))
7685     return false;
7686 
7687   // If we have a fake unary shuffle, the shuffle mask is spread across two
7688   // inputs that are actually the same node. Re-map the mask to always point
7689   // into the first input.
7690   if (IsFakeUnary)
7691     for (int &M : Mask)
7692       if (M >= (int)Mask.size())
7693         M -= Mask.size();
7694 
7695   // If we didn't already add operands in the opcode-specific code, default to
7696   // adding 1 or 2 operands starting at 0.
7697   if (Ops.empty()) {
7698     Ops.push_back(N->getOperand(0));
7699     if (!IsUnary || IsFakeUnary)
7700       Ops.push_back(N->getOperand(1));
7701   }
7702 
7703   return true;
7704 }
7705 
7706 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
7707 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7708                                  SmallVectorImpl<SDValue> &Ops,
7709                                  SmallVectorImpl<int> &Mask) {
7710   bool IsUnary;
7711   return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
7712 }
7713 
7714 /// Compute whether each element of a shuffle is zeroable.
7715 ///
7716 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7717 /// Either it is an undef element in the shuffle mask, the element of the input
7718 /// referenced is undef, or the element of the input referenced is known to be
7719 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7720 /// as many lanes with this technique as possible to simplify the remaining
7721 /// shuffle.
7722 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
7723                                            SDValue V1, SDValue V2,
7724                                            APInt &KnownUndef, APInt &KnownZero) {
7725   int Size = Mask.size();
7726   KnownUndef = KnownZero = APInt::getZero(Size);
7727 
7728   V1 = peekThroughBitcasts(V1);
7729   V2 = peekThroughBitcasts(V2);
7730 
7731   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7732   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7733 
7734   int VectorSizeInBits = V1.getValueSizeInBits();
7735   int ScalarSizeInBits = VectorSizeInBits / Size;
7736   assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7737 
7738   for (int i = 0; i < Size; ++i) {
7739     int M = Mask[i];
7740     // Handle the easy cases.
7741     if (M < 0) {
7742       KnownUndef.setBit(i);
7743       continue;
7744     }
7745     if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7746       KnownZero.setBit(i);
7747       continue;
7748     }
7749 
7750     // Determine shuffle input and normalize the mask.
7751     SDValue V = M < Size ? V1 : V2;
7752     M %= Size;
7753 
7754     // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7755     if (V.getOpcode() != ISD::BUILD_VECTOR)
7756       continue;
7757 
7758     // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
7759     // portion of the (larger) source element must be UNDEF/ZERO.
7760     if ((Size % V.getNumOperands()) == 0) {
7761       int Scale = Size / V->getNumOperands();
7762       SDValue Op = V.getOperand(M / Scale);
7763       if (Op.isUndef())
7764         KnownUndef.setBit(i);
7765       if (X86::isZeroNode(Op))
7766         KnownZero.setBit(i);
7767       else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7768         APInt Val = Cst->getAPIntValue();
7769         Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7770         if (Val == 0)
7771           KnownZero.setBit(i);
7772       } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7773         APInt Val = Cst->getValueAPF().bitcastToAPInt();
7774         Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7775         if (Val == 0)
7776           KnownZero.setBit(i);
7777       }
7778       continue;
7779     }
7780 
7781     // If the BUILD_VECTOR has more elements than the mask, then all the
7782     // (smaller) source elements must be UNDEF or ZERO.
7783     if ((V.getNumOperands() % Size) == 0) {
7784       int Scale = V->getNumOperands() / Size;
7785       bool AllUndef = true;
7786       bool AllZero = true;
7787       for (int j = 0; j < Scale; ++j) {
7788         SDValue Op = V.getOperand((M * Scale) + j);
7789         AllUndef &= Op.isUndef();
7790         AllZero &= X86::isZeroNode(Op);
7791       }
7792       if (AllUndef)
7793         KnownUndef.setBit(i);
7794       if (AllZero)
7795         KnownZero.setBit(i);
7796       continue;
7797     }
7798   }
7799 }
7800 
7801 /// Decode a target shuffle mask and inputs and see if any values are
7802 /// known to be undef or zero from their inputs.
7803 /// Returns true if the target shuffle mask was decoded.
7804 /// FIXME: Merge this with computeZeroableShuffleElements?
7805 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
7806                                          SmallVectorImpl<SDValue> &Ops,
7807                                          APInt &KnownUndef, APInt &KnownZero) {
7808   bool IsUnary;
7809   if (!isTargetShuffle(N.getOpcode()))
7810     return false;
7811 
7812   MVT VT = N.getSimpleValueType();
7813   if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
7814     return false;
7815 
7816   int Size = Mask.size();
7817   SDValue V1 = Ops[0];
7818   SDValue V2 = IsUnary ? V1 : Ops[1];
7819   KnownUndef = KnownZero = APInt::getZero(Size);
7820 
7821   V1 = peekThroughBitcasts(V1);
7822   V2 = peekThroughBitcasts(V2);
7823 
7824   assert((VT.getSizeInBits() % Size) == 0 &&
7825          "Illegal split of shuffle value type");
7826   unsigned EltSizeInBits = VT.getSizeInBits() / Size;
7827 
7828   // Extract known constant input data.
7829   APInt UndefSrcElts[2];
7830   SmallVector<APInt, 32> SrcEltBits[2];
7831   bool IsSrcConstant[2] = {
7832       getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
7833                                     SrcEltBits[0], true, false),
7834       getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
7835                                     SrcEltBits[1], true, false)};
7836 
7837   for (int i = 0; i < Size; ++i) {
7838     int M = Mask[i];
7839 
7840     // Already decoded as SM_SentinelZero / SM_SentinelUndef.
7841     if (M < 0) {
7842       assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
7843       if (SM_SentinelUndef == M)
7844         KnownUndef.setBit(i);
7845       if (SM_SentinelZero == M)
7846         KnownZero.setBit(i);
7847       continue;
7848     }
7849 
7850     // Determine shuffle input and normalize the mask.
7851     unsigned SrcIdx = M / Size;
7852     SDValue V = M < Size ? V1 : V2;
7853     M %= Size;
7854 
7855     // We are referencing an UNDEF input.
7856     if (V.isUndef()) {
7857       KnownUndef.setBit(i);
7858       continue;
7859     }
7860 
7861     // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
7862     // TODO: We currently only set UNDEF for integer types - floats use the same
7863     // registers as vectors and many of the scalar folded loads rely on the
7864     // SCALAR_TO_VECTOR pattern.
7865     if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
7866         (Size % V.getValueType().getVectorNumElements()) == 0) {
7867       int Scale = Size / V.getValueType().getVectorNumElements();
7868       int Idx = M / Scale;
7869       if (Idx != 0 && !VT.isFloatingPoint())
7870         KnownUndef.setBit(i);
7871       else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
7872         KnownZero.setBit(i);
7873       continue;
7874     }
7875 
7876     // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
7877     // base vectors.
7878     if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
7879       SDValue Vec = V.getOperand(0);
7880       int NumVecElts = Vec.getValueType().getVectorNumElements();
7881       if (Vec.isUndef() && Size == NumVecElts) {
7882         int Idx = V.getConstantOperandVal(2);
7883         int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
7884         if (M < Idx || (Idx + NumSubElts) <= M)
7885           KnownUndef.setBit(i);
7886       }
7887       continue;
7888     }
7889 
7890     // Attempt to extract from the source's constant bits.
7891     if (IsSrcConstant[SrcIdx]) {
7892       if (UndefSrcElts[SrcIdx][M])
7893         KnownUndef.setBit(i);
7894       else if (SrcEltBits[SrcIdx][M] == 0)
7895         KnownZero.setBit(i);
7896     }
7897   }
7898 
7899   assert(VT.getVectorNumElements() == (unsigned)Size &&
7900          "Different mask size from vector size!");
7901   return true;
7902 }
7903 
7904 // Replace target shuffle mask elements with known undef/zero sentinels.
7905 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
7906                                               const APInt &KnownUndef,
7907                                               const APInt &KnownZero,
7908                                               bool ResolveKnownZeros = true) {
7909   unsigned NumElts = Mask.size();
7910   assert(KnownUndef.getBitWidth() == NumElts &&
7911          KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
7912 
7913   for (unsigned i = 0; i != NumElts; ++i) {
7914     if (KnownUndef[i])
7915       Mask[i] = SM_SentinelUndef;
7916     else if (ResolveKnownZeros && KnownZero[i])
7917       Mask[i] = SM_SentinelZero;
7918   }
7919 }
7920 
7921 // Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
7922 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
7923                                               APInt &KnownUndef,
7924                                               APInt &KnownZero) {
7925   unsigned NumElts = Mask.size();
7926   KnownUndef = KnownZero = APInt::getZero(NumElts);
7927 
7928   for (unsigned i = 0; i != NumElts; ++i) {
7929     int M = Mask[i];
7930     if (SM_SentinelUndef == M)
7931       KnownUndef.setBit(i);
7932     if (SM_SentinelZero == M)
7933       KnownZero.setBit(i);
7934   }
7935 }
7936 
7937 // Forward declaration (for getFauxShuffleMask recursive check).
7938 // TODO: Use DemandedElts variant.
7939 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7940                                    SmallVectorImpl<int> &Mask,
7941                                    const SelectionDAG &DAG, unsigned Depth,
7942                                    bool ResolveKnownElts);
7943 
7944 // Attempt to decode ops that could be represented as a shuffle mask.
7945 // The decoded shuffle mask may contain a different number of elements than the
7946 // destination value type.
7947 // TODO: Merge into getTargetShuffleInputs()
7948 static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
7949                                SmallVectorImpl<int> &Mask,
7950                                SmallVectorImpl<SDValue> &Ops,
7951                                const SelectionDAG &DAG, unsigned Depth,
7952                                bool ResolveKnownElts) {
7953   Mask.clear();
7954   Ops.clear();
7955 
7956   MVT VT = N.getSimpleValueType();
7957   unsigned NumElts = VT.getVectorNumElements();
7958   unsigned NumSizeInBits = VT.getSizeInBits();
7959   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
7960   if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
7961     return false;
7962   assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
7963   unsigned NumSizeInBytes = NumSizeInBits / 8;
7964   unsigned NumBytesPerElt = NumBitsPerElt / 8;
7965 
7966   unsigned Opcode = N.getOpcode();
7967   switch (Opcode) {
7968   case ISD::VECTOR_SHUFFLE: {
7969     // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
7970     ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
7971     if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
7972       Mask.append(ShuffleMask.begin(), ShuffleMask.end());
7973       Ops.push_back(N.getOperand(0));
7974       Ops.push_back(N.getOperand(1));
7975       return true;
7976     }
7977     return false;
7978   }
7979   case ISD::AND:
7980   case X86ISD::ANDNP: {
7981     // Attempt to decode as a per-byte mask.
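    // Each constant byte must be 0x00 or 0xFF: for AND a 0x00 constant byte
    // zeroes the result byte (0xFF passes the other operand through); for
    // ANDNP the roles are reversed.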
7982     APInt UndefElts;
7983     SmallVector<APInt, 32> EltBits;
7984     SDValue N0 = N.getOperand(0);
7985     SDValue N1 = N.getOperand(1);
7986     bool IsAndN = (X86ISD::ANDNP == Opcode);
7987     uint64_t ZeroMask = IsAndN ? 255 : 0;
7988     if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
7989       return false;
7990     for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
7991       if (UndefElts[i]) {
7992         Mask.push_back(SM_SentinelUndef);
7993         continue;
7994       }
7995       const APInt &ByteBits = EltBits[i];
7996       if (ByteBits != 0 && ByteBits != 255)
7997         return false;
7998       Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
7999     }
8000     Ops.push_back(IsAndN ? N1 : N0);
8001     return true;
8002   }
8003   case ISD::OR: {
8004     // Handle the OR(SHUFFLE,SHUFFLE) case where, for each element, one source
8005     // is known zero and the other provides a valid shuffle index.
8006     SDValue N0 = peekThroughBitcasts(N.getOperand(0));
8007     SDValue N1 = peekThroughBitcasts(N.getOperand(1));
8008     if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
8009       return false;
8010     SmallVector<int, 64> SrcMask0, SrcMask1;
8011     SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
8012     if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
8013                                 true) ||
8014         !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
8015                                 true))
8016       return false;
8017 
8018     size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
8019     SmallVector<int, 64> Mask0, Mask1;
8020     narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
8021     narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
8022     for (int i = 0; i != (int)MaskSize; ++i) {
8023       // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
8024       // loops converting between OR and BLEND shuffles due to
8025       // canWidenShuffleElements merging away undef elements, meaning we
8026       // fail to recognise the OR as the undef element isn't known zero.
8027       if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
8028         Mask.push_back(SM_SentinelZero);
8029       else if (Mask1[i] == SM_SentinelZero)
8030         Mask.push_back(i);
8031       else if (Mask0[i] == SM_SentinelZero)
8032         Mask.push_back(i + MaskSize);
8033       else
8034         return false;
8035     }
8036     Ops.push_back(N0);
8037     Ops.push_back(N1);
8038     return true;
8039   }
8040   case ISD::INSERT_SUBVECTOR: {
8041     SDValue Src = N.getOperand(0);
8042     SDValue Sub = N.getOperand(1);
8043     EVT SubVT = Sub.getValueType();
8044     unsigned NumSubElts = SubVT.getVectorNumElements();
8045     if (!N->isOnlyUserOf(Sub.getNode()))
8046       return false;
8047     uint64_t InsertIdx = N.getConstantOperandVal(2);
8048     // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
8049     if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
8050         Sub.getOperand(0).getValueType() == VT) {
8051       uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
8052       for (int i = 0; i != (int)NumElts; ++i)
8053         Mask.push_back(i);
8054       for (int i = 0; i != (int)NumSubElts; ++i)
8055         Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
8056       Ops.push_back(Src);
8057       Ops.push_back(Sub.getOperand(0));
8058       return true;
8059     }
8060     // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
8061     SmallVector<int, 64> SubMask;
8062     SmallVector<SDValue, 2> SubInputs;
8063     if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
8064                                 SubMask, DAG, Depth + 1, ResolveKnownElts))
8065       return false;
8066 
8067     // Subvector shuffle inputs must not be larger than the subvector.
8068     if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
8069           return SubVT.getFixedSizeInBits() <
8070                  SubInput.getValueSizeInBits().getFixedSize();
8071         }))
8072       return false;
8073 
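    // Normalize the sub-shuffle mask and the outer shuffle to a common element
    // granularity before merging them.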
8074     if (SubMask.size() != NumSubElts) {
8075       assert(((SubMask.size() % NumSubElts) == 0 ||
8076               (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
8077       if ((NumSubElts % SubMask.size()) == 0) {
8078         int Scale = NumSubElts / SubMask.size();
8079         SmallVector<int, 64> ScaledSubMask;
8080         narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
8081         SubMask = ScaledSubMask;
8082       } else {
8083         int Scale = SubMask.size() / NumSubElts;
8084         NumSubElts = SubMask.size();
8085         NumElts *= Scale;
8086         InsertIdx *= Scale;
8087       }
8088     }
8089     Ops.push_back(Src);
8090     Ops.append(SubInputs.begin(), SubInputs.end());
8091     if (ISD::isBuildVectorAllZeros(Src.getNode()))
8092       Mask.append(NumElts, SM_SentinelZero);
8093     else
8094       for (int i = 0; i != (int)NumElts; ++i)
8095         Mask.push_back(i);
8096     for (int i = 0; i != (int)NumSubElts; ++i) {
8097       int M = SubMask[i];
8098       if (0 <= M) {
8099         int InputIdx = M / NumSubElts;
8100         M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
8101       }
8102       Mask[i + InsertIdx] = M;
8103     }
8104     return true;
8105   }
8106   case X86ISD::PINSRB:
8107   case X86ISD::PINSRW:
8108   case ISD::SCALAR_TO_VECTOR:
8109   case ISD::INSERT_VECTOR_ELT: {
8110     // Match against an insert_vector_elt/scalar_to_vector of an extract from a
8111     // vector, for matching src/dst vector types.
8112     SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
8113 
8114     unsigned DstIdx = 0;
8115     if (Opcode != ISD::SCALAR_TO_VECTOR) {
8116       // Check we have an in-range constant insertion index.
8117       if (!isa<ConstantSDNode>(N.getOperand(2)) ||
8118           N.getConstantOperandAPInt(2).uge(NumElts))
8119         return false;
8120       DstIdx = N.getConstantOperandVal(2);
8121 
8122       // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
8123       if (X86::isZeroNode(Scl)) {
8124         Ops.push_back(N.getOperand(0));
8125         for (unsigned i = 0; i != NumElts; ++i)
8126           Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
8127         return true;
8128       }
8129     }
8130 
8131     // Peek through trunc/aext/zext.
8132     // TODO: aext shouldn't require SM_SentinelZero padding.
8133     // TODO: handle shift of scalars.
8134     unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
8135     while (Scl.getOpcode() == ISD::TRUNCATE ||
8136            Scl.getOpcode() == ISD::ANY_EXTEND ||
8137            Scl.getOpcode() == ISD::ZERO_EXTEND) {
8138       Scl = Scl.getOperand(0);
8139       MinBitsPerElt =
8140           std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
8141     }
8142     if ((MinBitsPerElt % 8) != 0)
8143       return false;
8144 
8145     // Attempt to find the source vector the scalar was extracted from.
8146     SDValue SrcExtract;
8147     if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
8148          Scl.getOpcode() == X86ISD::PEXTRW ||
8149          Scl.getOpcode() == X86ISD::PEXTRB) &&
8150         Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
8151       SrcExtract = Scl;
8152     }
8153     if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
8154       return false;
8155 
8156     SDValue SrcVec = SrcExtract.getOperand(0);
8157     EVT SrcVT = SrcVec.getValueType();
8158     if (!SrcVT.getScalarType().isByteSized())
8159       return false;
8160     unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
8161     unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
8162     unsigned DstByte = DstIdx * NumBytesPerElt;
8163     MinBitsPerElt =
8164         std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
8165 
8166     // Create 'identity' byte level shuffle mask and then add inserted bytes.
8167     if (Opcode == ISD::SCALAR_TO_VECTOR) {
8168       Ops.push_back(SrcVec);
8169       Mask.append(NumSizeInBytes, SM_SentinelUndef);
8170     } else {
8171       Ops.push_back(SrcVec);
8172       Ops.push_back(N.getOperand(0));
8173       for (int i = 0; i != (int)NumSizeInBytes; ++i)
8174         Mask.push_back(NumSizeInBytes + i);
8175     }
8176 
8177     unsigned MinBytesPerElts = MinBitsPerElt / 8;
8178     MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
8179     for (unsigned i = 0; i != MinBytesPerElts; ++i)
8180       Mask[DstByte + i] = SrcByte + i;
8181     for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
8182       Mask[DstByte + i] = SM_SentinelZero;
8183     return true;
8184   }
8185   case X86ISD::PACKSS:
8186   case X86ISD::PACKUS: {
8187     SDValue N0 = N.getOperand(0);
8188     SDValue N1 = N.getOperand(1);
8189     assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
8190            N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
8191            "Unexpected input value type");
8192 
8193     APInt EltsLHS, EltsRHS;
8194     getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
8195 
8196     // If we know input saturation won't happen (or we don't care about
8197     // particular lanes), we can treat this as a truncation shuffle.
8198     bool Offset0 = false, Offset1 = false;
8199     if (Opcode == X86ISD::PACKSS) {
8200       if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8201            DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
8202           (!(N1.isUndef() || EltsRHS.isZero()) &&
8203            DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
8204         return false;
8205       // We can't easily fold ASHR into a shuffle, but if it was feeding a
8206       // PACKSS then it was likely being used for sign-extension for a
8207       // truncation, so just peek through and adjust the mask accordingly.
8208       if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
8209           N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
8210         Offset0 = true;
8211         N0 = N0.getOperand(0);
8212       }
8213       if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
8214           N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
8215         Offset1 = true;
8216         N1 = N1.getOperand(0);
8217       }
8218     } else {
8219       APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
8220       if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8221            !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
8222           (!(N1.isUndef() || EltsRHS.isZero()) &&
8223            !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
8224         return false;
8225     }
8226 
8227     bool IsUnary = (N0 == N1);
8228 
8229     Ops.push_back(N0);
8230     if (!IsUnary)
8231       Ops.push_back(N1);
8232 
8233     createPackShuffleMask(VT, Mask, IsUnary);
8234 
8235     if (Offset0 || Offset1) {
8236       for (int &M : Mask)
8237         if ((Offset0 && isInRange(M, 0, NumElts)) ||
8238             (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
8239           ++M;
8240     }
8241     return true;
8242   }
8243   case X86ISD::VTRUNC: {
8244     SDValue Src = N.getOperand(0);
8245     EVT SrcVT = Src.getValueType();
8246     // Truncated source must be a simple vector.
8247     if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8248         (SrcVT.getScalarSizeInBits() % 8) != 0)
8249       return false;
8250     unsigned NumSrcElts = SrcVT.getVectorNumElements();
8251     unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
8252     unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
8253     assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
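    // e.g. Truncating a v4i32 source into a v16i8 result produces the byte
    // mask <0,4,8,12> followed by twelve zeroable elements.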
8254     for (unsigned i = 0; i != NumSrcElts; ++i)
8255       Mask.push_back(i * Scale);
8256     Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
8257     Ops.push_back(Src);
8258     return true;
8259   }
8260   case X86ISD::VSHLI:
8261   case X86ISD::VSRLI: {
8262     uint64_t ShiftVal = N.getConstantOperandVal(1);
8263     // Out of range bit shifts are guaranteed to be zero.
8264     if (NumBitsPerElt <= ShiftVal) {
8265       Mask.append(NumElts, SM_SentinelZero);
8266       return true;
8267     }
8268 
8269     // We can only decode 'whole byte' bit shifts as shuffles.
8270     if ((ShiftVal % 8) != 0)
8271       break;
8272 
8273     uint64_t ByteShift = ShiftVal / 8;
8274     Ops.push_back(N.getOperand(0));
8275 
8276     // Clear mask to all zeros and insert the shifted byte indices.
8277     Mask.append(NumSizeInBytes, SM_SentinelZero);
8278 
8279     if (X86ISD::VSHLI == Opcode) {
8280       for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8281         for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8282           Mask[i + j] = i + j - ByteShift;
8283     } else {
8284       for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8285         for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8286           Mask[i + j - ByteShift] = i + j;
8287     }
8288     return true;
8289   }
8290   case X86ISD::VROTLI:
8291   case X86ISD::VROTRI: {
8292     // We can only decode 'whole byte' bit rotates as shuffles.
8293     uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
8294     if ((RotateVal % 8) != 0)
8295       return false;
8296     Ops.push_back(N.getOperand(0));
8297     int Offset = RotateVal / 8;
8298     Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
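    // e.g. For v4i32 VROTLI by 8 bits, each element uses the byte mask
    // <3,0,1,2> relative to that element's base byte.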
8299     for (int i = 0; i != (int)NumElts; ++i) {
8300       int BaseIdx = i * NumBytesPerElt;
8301       for (int j = 0; j != (int)NumBytesPerElt; ++j) {
8302         Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
8303       }
8304     }
8305     return true;
8306   }
8307   case X86ISD::VBROADCAST: {
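    // A non-vector broadcast source is only folded if it was extracted from
    // element 0 of a vector with a matching scalar type.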
8308     SDValue Src = N.getOperand(0);
8309     if (!Src.getSimpleValueType().isVector()) {
8310       if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8311           !isNullConstant(Src.getOperand(1)) ||
8312           Src.getOperand(0).getValueType().getScalarType() !=
8313               VT.getScalarType())
8314         return false;
8315       Src = Src.getOperand(0);
8316     }
8317     Ops.push_back(Src);
8318     Mask.append(NumElts, 0);
8319     return true;
8320   }
8321   case ISD::ZERO_EXTEND:
8322   case ISD::ANY_EXTEND:
8323   case ISD::ZERO_EXTEND_VECTOR_INREG:
8324   case ISD::ANY_EXTEND_VECTOR_INREG: {
8325     SDValue Src = N.getOperand(0);
8326     EVT SrcVT = Src.getValueType();
8327 
8328     // Extended source must be a simple vector.
8329     if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8330         (SrcVT.getScalarSizeInBits() % 8) != 0)
8331       return false;
8332 
8333     bool IsAnyExtend =
8334         (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
8335     DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
8336                          IsAnyExtend, Mask);
8337     Ops.push_back(Src);
8338     return true;
8339   }
8340   }
8341 
8342   return false;
8343 }
8344 
8345 /// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
8346 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8347                                               SmallVectorImpl<int> &Mask) {
8348   int MaskWidth = Mask.size();
8349   SmallVector<SDValue, 16> UsedInputs;
8350   for (int i = 0, e = Inputs.size(); i < e; ++i) {
8351     int lo = UsedInputs.size() * MaskWidth;
8352     int hi = lo + MaskWidth;
8353 
8354     // Strip UNDEF input usage.
8355     if (Inputs[i].isUndef())
8356       for (int &M : Mask)
8357         if ((lo <= M) && (M < hi))
8358           M = SM_SentinelUndef;
8359 
8360     // Check for unused inputs.
8361     if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8362       for (int &M : Mask)
8363         if (lo <= M)
8364           M -= MaskWidth;
8365       continue;
8366     }
8367 
8368     // Check for repeated inputs.
8369     bool IsRepeat = false;
8370     for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8371       if (UsedInputs[j] != Inputs[i])
8372         continue;
8373       for (int &M : Mask)
8374         if (lo <= M)
8375           M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8376       IsRepeat = true;
8377       break;
8378     }
8379     if (IsRepeat)
8380       continue;
8381 
8382     UsedInputs.push_back(Inputs[i]);
8383   }
8384   Inputs = UsedInputs;
8385 }
8386 
8387 /// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8388 /// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8389 /// Returns true if the target shuffle mask was decoded.
8390 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8391                                    SmallVectorImpl<SDValue> &Inputs,
8392                                    SmallVectorImpl<int> &Mask,
8393                                    APInt &KnownUndef, APInt &KnownZero,
8394                                    const SelectionDAG &DAG, unsigned Depth,
8395                                    bool ResolveKnownElts) {
8396   if (Depth >= SelectionDAG::MaxRecursionDepth)
8397     return false; // Limit search depth.
8398 
8399   EVT VT = Op.getValueType();
8400   if (!VT.isSimple() || !VT.isVector())
8401     return false;
8402 
8403   if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8404     if (ResolveKnownElts)
8405       resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8406     return true;
8407   }
8408   if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8409                          ResolveKnownElts)) {
8410     resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8411     return true;
8412   }
8413   return false;
8414 }
8415 
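// As above, but with all vector elements demanded and the known undef/zero
// masks discarded.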
8416 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8417                                    SmallVectorImpl<int> &Mask,
8418                                    const SelectionDAG &DAG, unsigned Depth = 0,
8419                                    bool ResolveKnownElts = true) {
8420   EVT VT = Op.getValueType();
8421   if (!VT.isSimple() || !VT.isVector())
8422     return false;
8423 
8424   APInt KnownUndef, KnownZero;
8425   unsigned NumElts = Op.getValueType().getVectorNumElements();
8426   APInt DemandedElts = APInt::getAllOnes(NumElts);
8427   return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8428                                 KnownZero, DAG, Depth, ResolveKnownElts);
8429 }
8430 
8431 // Attempt to create a scalar/subvector broadcast from the base MemSDNode.
8432 static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8433                                  EVT MemVT, MemSDNode *Mem, unsigned Offset,
8434                                  SelectionDAG &DAG) {
8435   assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8436           Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8437          "Unknown broadcast load type");
8438 
8439   // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8440   if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8441     return SDValue();
8442 
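  // Create the broadcast memop at Offset bytes from the original pointer,
  // reusing Mem's chain and a suitably offset machine memory operand.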
8443   SDValue Ptr =
8444       DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8445   SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8446   SDValue Ops[] = {Mem->getChain(), Ptr};
8447   SDValue BcstLd = DAG.getMemIntrinsicNode(
8448       Opcode, DL, Tys, Ops, MemVT,
8449       DAG.getMachineFunction().getMachineMemOperand(
8450           Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8451   DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8452   return BcstLd;
8453 }
8454 
8455 /// Returns the scalar element that will make up the Index'th
8456 /// element of the result of the vector shuffle.
8457 static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8458                                    SelectionDAG &DAG, unsigned Depth) {
8459   if (Depth >= SelectionDAG::MaxRecursionDepth)
8460     return SDValue(); // Limit search depth.
8461 
8462   EVT VT = Op.getValueType();
8463   unsigned Opcode = Op.getOpcode();
8464   unsigned NumElems = VT.getVectorNumElements();
8465 
8466   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8467   if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8468     int Elt = SV->getMaskElt(Index);
8469 
8470     if (Elt < 0)
8471       return DAG.getUNDEF(VT.getVectorElementType());
8472 
8473     SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8474     return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8475   }
8476 
8477   // Recurse into target specific vector shuffles to find scalars.
8478   if (isTargetShuffle(Opcode)) {
8479     MVT ShufVT = VT.getSimpleVT();
8480     MVT ShufSVT = ShufVT.getVectorElementType();
8481     int NumElems = (int)ShufVT.getVectorNumElements();
8482     SmallVector<int, 16> ShuffleMask;
8483     SmallVector<SDValue, 16> ShuffleOps;
8484     if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8485                               ShuffleMask))
8486       return SDValue();
8487 
8488     int Elt = ShuffleMask[Index];
8489     if (Elt == SM_SentinelZero)
8490       return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8491                                  : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8492     if (Elt == SM_SentinelUndef)
8493       return DAG.getUNDEF(ShufSVT);
8494 
8495     assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8496     SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8497     return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8498   }
8499 
8500   // Recurse into insert_subvector base/sub vector to find scalars.
8501   if (Opcode == ISD::INSERT_SUBVECTOR) {
8502     SDValue Vec = Op.getOperand(0);
8503     SDValue Sub = Op.getOperand(1);
8504     uint64_t SubIdx = Op.getConstantOperandVal(2);
8505     unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8506 
8507     if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8508       return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8509     return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8510   }
8511 
8512   // Recurse into concat_vectors sub vector to find scalars.
8513   if (Opcode == ISD::CONCAT_VECTORS) {
8514     EVT SubVT = Op.getOperand(0).getValueType();
8515     unsigned NumSubElts = SubVT.getVectorNumElements();
8516     uint64_t SubIdx = Index / NumSubElts;
8517     uint64_t SubElt = Index % NumSubElts;
8518     return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8519   }
8520 
8521   // Recurse into extract_subvector src vector to find scalars.
8522   if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8523     SDValue Src = Op.getOperand(0);
8524     uint64_t SrcIdx = Op.getConstantOperandVal(1);
8525     return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8526   }
8527 
8528   // We only peek through bitcasts of the same vector width.
8529   if (Opcode == ISD::BITCAST) {
8530     SDValue Src = Op.getOperand(0);
8531     EVT SrcVT = Src.getValueType();
8532     if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8533       return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8534     return SDValue();
8535   }
8536 
8537   // Actual nodes that may contain scalar elements
8538 
8539   // For insert_vector_elt - either return the index matching scalar or recurse
8540   // into the base vector.
8541   if (Opcode == ISD::INSERT_VECTOR_ELT &&
8542       isa<ConstantSDNode>(Op.getOperand(2))) {
8543     if (Op.getConstantOperandAPInt(2) == Index)
8544       return Op.getOperand(1);
8545     return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8546   }
8547 
8548   if (Opcode == ISD::SCALAR_TO_VECTOR)
8549     return (Index == 0) ? Op.getOperand(0)
8550                         : DAG.getUNDEF(VT.getVectorElementType());
8551 
8552   if (Opcode == ISD::BUILD_VECTOR)
8553     return Op.getOperand(Index);
8554 
8555   return SDValue();
8556 }
8557 
8558 // Use PINSRB/PINSRW/PINSRD to create a build vector.
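// For example (illustrative), a v8i16 build_vector whose only non-zero
// operands are at indices 1 and 3 becomes a zero vector followed by two
// INSERT_VECTOR_ELT nodes, which select as PINSRW.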
8559 static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8560                                         unsigned NumNonZero, unsigned NumZero,
8561                                         SelectionDAG &DAG,
8562                                         const X86Subtarget &Subtarget) {
8563   MVT VT = Op.getSimpleValueType();
8564   unsigned NumElts = VT.getVectorNumElements();
8565   assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8566           ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8567          "Illegal vector insertion");
8568 
8569   SDLoc dl(Op);
8570   SDValue V;
8571   bool First = true;
8572 
8573   for (unsigned i = 0; i < NumElts; ++i) {
8574     bool IsNonZero = NonZeroMask[i];
8575     if (!IsNonZero)
8576       continue;
8577 
    // If the build vector contains zeros or our first insertion is not the
    // first index, then insert into a zero vector to break any register
    // dependency; else use SCALAR_TO_VECTOR.
8581     if (First) {
8582       First = false;
8583       if (NumZero || 0 != i)
8584         V = getZeroVector(VT, Subtarget, DAG, dl);
8585       else {
8586         assert(0 == i && "Expected insertion into zero-index");
8587         V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8588         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
8589         V = DAG.getBitcast(VT, V);
8590         continue;
8591       }
8592     }
8593     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
8594                     DAG.getIntPtrConstant(i, dl));
8595   }
8596 
8597   return V;
8598 }
8599 
8600 /// Custom lower build_vector of v16i8.
8601 static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
8602                                      unsigned NumNonZero, unsigned NumZero,
8603                                      SelectionDAG &DAG,
8604                                      const X86Subtarget &Subtarget) {
8605   if (NumNonZero > 8 && !Subtarget.hasSSE41())
8606     return SDValue();
8607 
8608   // SSE4.1 - use PINSRB to insert each byte directly.
8609   if (Subtarget.hasSSE41())
8610     return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8611                                     Subtarget);
8612 
8613   SDLoc dl(Op);
8614   SDValue V;
8615 
8616   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
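  // For example (illustrative), each non-zero byte pair (b[i], b[i+1]) is
  // combined as (or b[i], (shl b[i+1], 8)) in i32, truncated to i16 and
  // inserted as word i/2.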
8617   for (unsigned i = 0; i < 16; i += 2) {
8618     bool ThisIsNonZero = NonZeroMask[i];
8619     bool NextIsNonZero = NonZeroMask[i + 1];
8620     if (!ThisIsNonZero && !NextIsNonZero)
8621       continue;
8622 
8623     // FIXME: Investigate combining the first 4 bytes as a i32 instead.
8624     SDValue Elt;
8625     if (ThisIsNonZero) {
8626       if (NumZero || NextIsNonZero)
8627         Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8628       else
8629         Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8630     }
8631 
8632     if (NextIsNonZero) {
8633       SDValue NextElt = Op.getOperand(i + 1);
8634       if (i == 0 && NumZero)
8635         NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
8636       else
8637         NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
8638       NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
8639                             DAG.getConstant(8, dl, MVT::i8));
8640       if (ThisIsNonZero)
8641         Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
8642       else
8643         Elt = NextElt;
8644     }
8645 
    // If our first insertion is not the first index or zeros are needed, then
    // insert into a zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
    // elements undefined).
8649     if (!V) {
8650       if (i != 0 || NumZero)
8651         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
8652       else {
8653         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
8654         V = DAG.getBitcast(MVT::v8i16, V);
8655         continue;
8656       }
8657     }
8658     Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
8659     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
8660                     DAG.getIntPtrConstant(i / 2, dl));
8661   }
8662 
8663   return DAG.getBitcast(MVT::v16i8, V);
8664 }
8665 
8666 /// Custom lower build_vector of v8i16.
8667 static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
8668                                      unsigned NumNonZero, unsigned NumZero,
8669                                      SelectionDAG &DAG,
8670                                      const X86Subtarget &Subtarget) {
8671   if (NumNonZero > 4 && !Subtarget.hasSSE41())
8672     return SDValue();
8673 
  // Use PINSRW to insert each i16 element directly.
8675   return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8676                                   Subtarget);
8677 }
8678 
8679 /// Custom lower build_vector of v4i32 or v4f32.
8680 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
8681                                      const X86Subtarget &Subtarget) {
8682   // If this is a splat of a pair of elements, use MOVDDUP (unless the target
8683   // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
8684   // Because we're creating a less complicated build vector here, we may enable
8685   // further folding of the MOVDDUP via shuffle transforms.
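  // For example (illustrative):
  //   (v4f32 build_vector A, B, A, B)
  //     --> (v4f32 bitcast (MOVDDUP (v2f64 bitcast (build_vector A, B, u, u))))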
8686   if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
8687       Op.getOperand(0) == Op.getOperand(2) &&
8688       Op.getOperand(1) == Op.getOperand(3) &&
8689       Op.getOperand(0) != Op.getOperand(1)) {
8690     SDLoc DL(Op);
8691     MVT VT = Op.getSimpleValueType();
8692     MVT EltVT = VT.getVectorElementType();
8693     // Create a new build vector with the first 2 elements followed by undef
8694     // padding, bitcast to v2f64, duplicate, and bitcast back.
8695     SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8696                        DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8697     SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
8698     SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
8699     return DAG.getBitcast(VT, Dup);
8700   }
8701 
8702   // Find all zeroable elements.
8703   std::bitset<4> Zeroable, Undefs;
8704   for (int i = 0; i < 4; ++i) {
8705     SDValue Elt = Op.getOperand(i);
8706     Undefs[i] = Elt.isUndef();
8707     Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
8708   }
8709   assert(Zeroable.size() - Zeroable.count() > 1 &&
8710          "We expect at least two non-zero elements!");
8711 
8712   // We only know how to deal with build_vector nodes where elements are either
8713   // zeroable or extract_vector_elt with constant index.
8714   SDValue FirstNonZero;
8715   unsigned FirstNonZeroIdx;
8716   for (unsigned i = 0; i < 4; ++i) {
8717     if (Zeroable[i])
8718       continue;
8719     SDValue Elt = Op.getOperand(i);
8720     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8721         !isa<ConstantSDNode>(Elt.getOperand(1)))
8722       return SDValue();
8723     // Make sure that this node is extracting from a 128-bit vector.
8724     MVT VT = Elt.getOperand(0).getSimpleValueType();
8725     if (!VT.is128BitVector())
8726       return SDValue();
8727     if (!FirstNonZero.getNode()) {
8728       FirstNonZero = Elt;
8729       FirstNonZeroIdx = i;
8730     }
8731   }
8732 
8733   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
8734   SDValue V1 = FirstNonZero.getOperand(0);
8735   MVT VT = V1.getSimpleValueType();
8736 
8737   // See if this build_vector can be lowered as a blend with zero.
8738   SDValue Elt;
8739   unsigned EltMaskIdx, EltIdx;
8740   int Mask[4];
8741   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
8742     if (Zeroable[EltIdx]) {
8743       // The zero vector will be on the right hand side.
8744       Mask[EltIdx] = EltIdx+4;
8745       continue;
8746     }
8747 
8748     Elt = Op->getOperand(EltIdx);
    // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
8750     EltMaskIdx = Elt.getConstantOperandVal(1);
8751     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
8752       break;
8753     Mask[EltIdx] = EltIdx;
8754   }
8755 
8756   if (EltIdx == 4) {
8757     // Let the shuffle legalizer deal with blend operations.
8758     SDValue VZeroOrUndef = (Zeroable == Undefs)
8759                                ? DAG.getUNDEF(VT)
8760                                : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
8761     if (V1.getSimpleValueType() != VT)
8762       V1 = DAG.getBitcast(VT, V1);
8763     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
8764   }
8765 
8766   // See if we can lower this build_vector to a INSERTPS.
8767   if (!Subtarget.hasSSE41())
8768     return SDValue();
8769 
8770   SDValue V2 = Elt.getOperand(0);
8771   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
8772     V1 = SDValue();
8773 
8774   bool CanFold = true;
8775   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
8776     if (Zeroable[i])
8777       continue;
8778 
8779     SDValue Current = Op->getOperand(i);
8780     SDValue SrcVector = Current->getOperand(0);
8781     if (!V1.getNode())
8782       V1 = SrcVector;
8783     CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
8784   }
8785 
8786   if (!CanFold)
8787     return SDValue();
8788 
8789   assert(V1.getNode() && "Expected at least two non-zero elements!");
8790   if (V1.getSimpleValueType() != MVT::v4f32)
8791     V1 = DAG.getBitcast(MVT::v4f32, V1);
8792   if (V2.getSimpleValueType() != MVT::v4f32)
8793     V2 = DAG.getBitcast(MVT::v4f32, V2);
8794 
8795   // Ok, we can emit an INSERTPS instruction.
8796   unsigned ZMask = Zeroable.to_ulong();
8797 
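  // INSERTPS immediate layout (for reference): bits [7:6] select the source
  // element (CountS), bits [5:4] select the destination element (CountD), and
  // bits [3:0] zero out destination elements (ZMask).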
8798   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
8799   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8800   SDLoc DL(Op);
8801   SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8802                                DAG.getIntPtrConstant(InsertPSMask, DL, true));
8803   return DAG.getBitcast(VT, Result);
8804 }
8805 
8806 /// Return a vector logical shift node.
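/// For example (illustrative), getVShift(true, MVT::v2i64, V, 64, ...) returns
/// (bitcast (VSHLDQ (bitcast v16i8 V), 8)), i.e. a whole-vector left shift by
/// 8 bytes.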
8807 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
8808                          SelectionDAG &DAG, const TargetLowering &TLI,
8809                          const SDLoc &dl) {
8810   assert(VT.is128BitVector() && "Unknown type for VShift");
8811   MVT ShVT = MVT::v16i8;
8812   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
8813   SrcOp = DAG.getBitcast(ShVT, SrcOp);
8814   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
8815   SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
8816   return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
8817 }
8818 
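/// Attempt to widen a scalar (i32/f32) stack load into a vector load of VT and
/// absorb any constant pointer offset into a splat shuffle mask.
/// For example (illustrative), splatting (f32 load (FrameIndex + 8)) as v4f32
/// can become (vector_shuffle<2,2,2,2> (v4f32 load FrameIndex), undef).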
8819 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
8820                                       SelectionDAG &DAG) {
8821 
  // Check if the scalar load can be widened into a vector load, and if
  // the address is "base + cst", see if the cst can be "absorbed" into
  // the shuffle mask.
8825   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
8826     SDValue Ptr = LD->getBasePtr();
8827     if (!ISD::isNormalLoad(LD) || !LD->isSimple())
8828       return SDValue();
8829     EVT PVT = LD->getValueType(0);
8830     if (PVT != MVT::i32 && PVT != MVT::f32)
8831       return SDValue();
8832 
8833     int FI = -1;
8834     int64_t Offset = 0;
8835     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
8836       FI = FINode->getIndex();
8837       Offset = 0;
8838     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
8839                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
8840       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
8841       Offset = Ptr.getConstantOperandVal(1);
8842       Ptr = Ptr.getOperand(0);
8843     } else {
8844       return SDValue();
8845     }
8846 
    // FIXME: 256-bit vector instructions don't require strict alignment;
    // improve this code to support them better.
8849     Align RequiredAlign(VT.getSizeInBits() / 8);
8850     SDValue Chain = LD->getChain();
8851     // Make sure the stack object alignment is at least 16 or 32.
8852     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8853     MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
8854     if (!InferredAlign || *InferredAlign < RequiredAlign) {
8855       if (MFI.isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute
        // the exact stack offset and reference FI + adjusted offset instead.
        // If someone *really* cares about this, that's the way to implement it.
8859         return SDValue();
8860       } else {
8861         MFI.setObjectAlignment(FI, RequiredAlign);
8862       }
8863     }
8864 
    // (Offset % 16 or 32) must be a multiple of 4. The address is then
    // Ptr + (Offset & ~15).
8867     if (Offset < 0)
8868       return SDValue();
8869     if ((Offset % RequiredAlign.value()) & 3)
8870       return SDValue();
8871     int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
8872     if (StartOffset) {
8873       SDLoc DL(Ptr);
8874       Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8875                         DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
8876     }
8877 
8878     int EltNo = (Offset - StartOffset) >> 2;
8879     unsigned NumElems = VT.getVectorNumElements();
8880 
8881     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
8882     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
8883                              LD->getPointerInfo().getWithOffset(StartOffset));
8884 
8885     SmallVector<int, 8> Mask(NumElems, EltNo);
8886 
8887     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
8888   }
8889 
8890   return SDValue();
8891 }
8892 
// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
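// For example (illustrative), (trunc (srl (i32 load X), 16)) resolves to the
// i32 load with ByteOffset == 2.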
8894 static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
8895   if (ISD::isNON_EXTLoad(Elt.getNode())) {
8896     auto *BaseLd = cast<LoadSDNode>(Elt);
8897     if (!BaseLd->isSimple())
8898       return false;
8899     Ld = BaseLd;
8900     ByteOffset = 0;
8901     return true;
8902   }
8903 
8904   switch (Elt.getOpcode()) {
8905   case ISD::BITCAST:
8906   case ISD::TRUNCATE:
8907   case ISD::SCALAR_TO_VECTOR:
8908     return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
8909   case ISD::SRL:
8910     if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8911       uint64_t Amt = AmtC->getZExtValue();
8912       if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
8913         ByteOffset += Amt / 8;
8914         return true;
8915       }
8916     }
8917     break;
8918   case ISD::EXTRACT_VECTOR_ELT:
8919     if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8920       SDValue Src = Elt.getOperand(0);
8921       unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
8922       unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
8923       if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
8924           findEltLoadSrc(Src, Ld, ByteOffset)) {
8925         uint64_t Idx = IdxC->getZExtValue();
8926         ByteOffset += Idx * (SrcSizeInBits / 8);
8927         return true;
8928       }
8929     }
8930     break;
8931   }
8932 
8933   return false;
8934 }
8935 
8936 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
8937 /// elements can be replaced by a single large load which has the same value as
8938 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
8939 ///
8940 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
8941 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
8942                                         const SDLoc &DL, SelectionDAG &DAG,
8943                                         const X86Subtarget &Subtarget,
8944                                         bool IsAfterLegalize) {
8945   if ((VT.getScalarSizeInBits() % 8) != 0)
8946     return SDValue();
8947 
8948   unsigned NumElems = Elts.size();
8949 
8950   int LastLoadedElt = -1;
8951   APInt LoadMask = APInt::getZero(NumElems);
8952   APInt ZeroMask = APInt::getZero(NumElems);
8953   APInt UndefMask = APInt::getZero(NumElems);
8954 
8955   SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
8956   SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
8957 
  // For each element in the initializer, see if we've found a load, a zero or
  // an undef.
8960   for (unsigned i = 0; i < NumElems; ++i) {
8961     SDValue Elt = peekThroughBitcasts(Elts[i]);
8962     if (!Elt.getNode())
8963       return SDValue();
8964     if (Elt.isUndef()) {
8965       UndefMask.setBit(i);
8966       continue;
8967     }
8968     if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
8969       ZeroMask.setBit(i);
8970       continue;
8971     }
8972 
8973     // Each loaded element must be the correct fractional portion of the
8974     // requested vector load.
8975     unsigned EltSizeInBits = Elt.getValueSizeInBits();
8976     if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
8977       return SDValue();
8978 
8979     if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
8980       return SDValue();
8981     unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
8982     if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
8983       return SDValue();
8984 
8985     LoadMask.setBit(i);
8986     LastLoadedElt = i;
8987   }
8988   assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
8989           LoadMask.countPopulation()) == NumElems &&
8990          "Incomplete element masks");
8991 
8992   // Handle Special Cases - all undef or undef/zero.
8993   if (UndefMask.countPopulation() == NumElems)
8994     return DAG.getUNDEF(VT);
8995   if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
8996     return VT.isInteger() ? DAG.getConstant(0, DL, VT)
8997                           : DAG.getConstantFP(0.0, DL, VT);
8998 
8999   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9000   int FirstLoadedElt = LoadMask.countTrailingZeros();
9001   SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
9002   EVT EltBaseVT = EltBase.getValueType();
9003   assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
9004          "Register/Memory size mismatch");
9005   LoadSDNode *LDBase = Loads[FirstLoadedElt];
9006   assert(LDBase && "Did not find base load for merging consecutive loads");
9007   unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
9008   unsigned BaseSizeInBytes = BaseSizeInBits / 8;
9009   int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
9010   int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
9011   assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
9012 
9013   // TODO: Support offsetting the base load.
9014   if (ByteOffsets[FirstLoadedElt] != 0)
9015     return SDValue();
9016 
9017   // Check to see if the element's load is consecutive to the base load
9018   // or offset from a previous (already checked) load.
9019   auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
9020     LoadSDNode *Ld = Loads[EltIdx];
9021     int64_t ByteOffset = ByteOffsets[EltIdx];
9022     if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
9023       int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
9024       return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
9025               Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
9026     }
9027     return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
9028                                               EltIdx - FirstLoadedElt);
9029   };
9030 
  // Consecutive loads can contain UNDEFs but not ZERO elements.
  // Consecutive loads with UNDEFs and ZERO elements require an
  // additional shuffle stage to clear the ZERO elements.
9034   bool IsConsecutiveLoad = true;
9035   bool IsConsecutiveLoadWithZeros = true;
9036   for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
9037     if (LoadMask[i]) {
9038       if (!CheckConsecutiveLoad(LDBase, i)) {
9039         IsConsecutiveLoad = false;
9040         IsConsecutiveLoadWithZeros = false;
9041         break;
9042       }
9043     } else if (ZeroMask[i]) {
9044       IsConsecutiveLoad = false;
9045     }
9046   }
9047 
9048   auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
9049     auto MMOFlags = LDBase->getMemOperand()->getFlags();
9050     assert(LDBase->isSimple() &&
9051            "Cannot merge volatile or atomic loads.");
9052     SDValue NewLd =
9053         DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
9054                     LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
9055                     MMOFlags);
9056     for (auto *LD : Loads)
9057       if (LD)
9058         DAG.makeEquivalentMemoryOrdering(LD, NewLd);
9059     return NewLd;
9060   };
9061 
9062   // Check if the base load is entirely dereferenceable.
9063   bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
9064       VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
9065 
9066   // LOAD - all consecutive load/undefs (must start/end with a load or be
9067   // entirely dereferenceable). If we have found an entire vector of loads and
9068   // undefs, then return a large load of the entire vector width starting at the
  // base pointer. If the vector contains zeros, then attempt to clear those
  // elements with a shuffle against a zero vector.
9071   if (FirstLoadedElt == 0 &&
9072       (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
9073       (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
9074     if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
9075       return SDValue();
9076 
9077     // Don't create 256-bit non-temporal aligned loads without AVX2 as these
9078     // will lower to regular temporal loads and use the cache.
9079     if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
9080         VT.is256BitVector() && !Subtarget.hasInt256())
9081       return SDValue();
9082 
9083     if (NumElems == 1)
9084       return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
9085 
9086     if (!ZeroMask)
9087       return CreateLoad(VT, LDBase);
9088 
9089     // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
9090     // vector and a zero vector to clear out the zero elements.
9091     if (!IsAfterLegalize && VT.isVector()) {
9092       unsigned NumMaskElts = VT.getVectorNumElements();
9093       if ((NumMaskElts % NumElems) == 0) {
9094         unsigned Scale = NumMaskElts / NumElems;
9095         SmallVector<int, 4> ClearMask(NumMaskElts, -1);
9096         for (unsigned i = 0; i < NumElems; ++i) {
9097           if (UndefMask[i])
9098             continue;
9099           int Offset = ZeroMask[i] ? NumMaskElts : 0;
9100           for (unsigned j = 0; j != Scale; ++j)
9101             ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
9102         }
9103         SDValue V = CreateLoad(VT, LDBase);
9104         SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
9105                                    : DAG.getConstantFP(0.0, DL, VT);
9106         return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
9107       }
9108     }
9109   }
9110 
9111   // If the upper half of a ymm/zmm load is undef then just load the lower half.
9112   if (VT.is256BitVector() || VT.is512BitVector()) {
9113     unsigned HalfNumElems = NumElems / 2;
9114     if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
9115       EVT HalfVT =
9116           EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
9117       SDValue HalfLD =
9118           EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
9119                                    DAG, Subtarget, IsAfterLegalize);
9120       if (HalfLD)
9121         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
9122                            HalfLD, DAG.getIntPtrConstant(0, DL));
9123     }
9124   }
9125 
  // VZEXT_LOAD - consecutive 16/32/64-bit load/undefs followed by zeros/undefs.
9127   if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
9128       ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
9129        LoadSizeInBits == 64) &&
9130       ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
9131     MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
9132                                       : MVT::getIntegerVT(LoadSizeInBits);
9133     MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
9134     // Allow v4f32 on SSE1 only targets.
9135     // FIXME: Add more isel patterns so we can just use VT directly.
9136     if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
9137       VecVT = MVT::v4f32;
9138     if (TLI.isTypeLegal(VecVT)) {
9139       SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
9140       SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
9141       SDValue ResNode = DAG.getMemIntrinsicNode(
9142           X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
9143           LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
9144       for (auto *LD : Loads)
9145         if (LD)
9146           DAG.makeEquivalentMemoryOrdering(LD, ResNode);
9147       return DAG.getBitcast(VT, ResNode);
9148     }
9149   }
9150 
9151   // BROADCAST - match the smallest possible repetition pattern, load that
9152   // scalar/subvector element and then broadcast to the entire vector.
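  // For example (illustrative), a v8i32 <a,b,a,b,a,b,a,b> built from two
  // consecutive loads can be lowered as a single i64 load of {a,b} that is
  // broadcast to all four 64-bit lanes.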
9153   if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
9154       (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
9155     for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
9156       unsigned RepeatSize = SubElems * BaseSizeInBits;
9157       unsigned ScalarSize = std::min(RepeatSize, 64u);
9158       if (!Subtarget.hasAVX2() && ScalarSize < 32)
9159         continue;
9160 
      // Don't attempt a 1:N subvector broadcast - it should be caught by
      // combineConcatVectorOps, else it will cause infinite loops.
9163       if (RepeatSize > ScalarSize && SubElems == 1)
9164         continue;
9165 
9166       bool Match = true;
9167       SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
9168       for (unsigned i = 0; i != NumElems && Match; ++i) {
9169         if (!LoadMask[i])
9170           continue;
9171         SDValue Elt = peekThroughBitcasts(Elts[i]);
9172         if (RepeatedLoads[i % SubElems].isUndef())
9173           RepeatedLoads[i % SubElems] = Elt;
9174         else
9175           Match &= (RepeatedLoads[i % SubElems] == Elt);
9176       }
9177 
9178       // We must have loads at both ends of the repetition.
9179       Match &= !RepeatedLoads.front().isUndef();
9180       Match &= !RepeatedLoads.back().isUndef();
9181       if (!Match)
9182         continue;
9183 
9184       EVT RepeatVT =
9185           VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
9186               ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
9187               : EVT::getFloatingPointVT(ScalarSize);
9188       if (RepeatSize > ScalarSize)
9189         RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
9190                                     RepeatSize / ScalarSize);
9191       EVT BroadcastVT =
9192           EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
9193                            VT.getSizeInBits() / ScalarSize);
9194       if (TLI.isTypeLegal(BroadcastVT)) {
9195         if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
9196                 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
9197           SDValue Broadcast = RepeatLoad;
9198           if (RepeatSize > ScalarSize) {
9199             while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
9200               Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
9201           } else {
9202             if (!Subtarget.hasAVX2() &&
9203                 !X86::mayFoldLoadIntoBroadcastFromMem(
9204                     RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
9205                     Subtarget,
9206                     /*AssumeSingleUse=*/true))
9207               return SDValue();
9208             Broadcast =
9209                 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
9210           }
9211           return DAG.getBitcast(VT, Broadcast);
9212         }
9213       }
9214     }
9215   }
9216 
9217   return SDValue();
9218 }
9219 
// Combine a vector op (shuffles etc.) that is equal to build_vector load1,
// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
// are consecutive, non-overlapping, and in the right order.
9223 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9224                                          SelectionDAG &DAG,
9225                                          const X86Subtarget &Subtarget,
9226                                          bool IsAfterLegalize) {
9227   SmallVector<SDValue, 64> Elts;
9228   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9229     if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9230       Elts.push_back(Elt);
9231       continue;
9232     }
9233     return SDValue();
9234   }
9235   assert(Elts.size() == VT.getVectorNumElements());
9236   return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9237                                   IsAfterLegalize);
9238 }
9239 
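// Build a ConstantVector with VT's element type, reproducing the repeated
// 'SplatValue' pattern of width 'SplatBitSize'. For example (illustrative), a
// 64-bit SplatValue with 32-bit elements yields the two i32 constants formed
// from its low and high halves.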
9240 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9241                                    unsigned SplatBitSize, LLVMContext &C) {
9242   unsigned ScalarSize = VT.getScalarSizeInBits();
9243   unsigned NumElm = SplatBitSize / ScalarSize;
9244 
9245   SmallVector<Constant *, 32> ConstantVec;
9246   for (unsigned i = 0; i < NumElm; i++) {
9247     APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
9248     Constant *Const;
9249     if (VT.isFloatingPoint()) {
9250       if (ScalarSize == 16) {
9251         Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9252       } else if (ScalarSize == 32) {
9253         Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9254       } else {
9255         assert(ScalarSize == 64 && "Unsupported floating point scalar size");
9256         Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9257       }
9258     } else
9259       Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9260     ConstantVec.push_back(Const);
9261   }
9262   return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9263 }
9264 
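// Return true if some user of \p N is expected to fold it (e.g. as a constant
// pool operand of a target shuffle), looking through bitcasts. Index operands
// of VPERMV/VPERMV3 never fold.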
9265 static bool isFoldableUseOfShuffle(SDNode *N) {
9266   for (auto *U : N->uses()) {
9267     unsigned Opc = U->getOpcode();
9268     // VPERMV/VPERMV3 shuffles can never fold their index operands.
9269     if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9270       return false;
9271     if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9272       return false;
9273     if (isTargetShuffle(Opc))
9274       return true;
9275     if (Opc == ISD::BITCAST) // Ignore bitcasts
9276       return isFoldableUseOfShuffle(U);
9277     if (N->hasOneUse()) {
      // TODO: There may be some general way to know if an SDNode can
      // be folded. We currently only know whether an MI is foldable.
9280       if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
9281         return false;
9282       return true;
9283     }
9284   }
9285   return false;
9286 }
9287 
9288 /// Attempt to use the vbroadcast instruction to generate a splat value
9289 /// from a splat BUILD_VECTOR which uses:
9290 ///  a. A single scalar load, or a constant.
9291 ///  b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9292 ///
9293 /// The VBROADCAST node is returned when a pattern is found,
9294 /// or SDValue() otherwise.
9295 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9296                                            const X86Subtarget &Subtarget,
9297                                            SelectionDAG &DAG) {
9298   // VBROADCAST requires AVX.
9299   // TODO: Splats could be generated for non-AVX CPUs using SSE
9300   // instructions, but there's less potential gain for only 128-bit vectors.
9301   if (!Subtarget.hasAVX())
9302     return SDValue();
9303 
9304   MVT VT = BVOp->getSimpleValueType(0);
9305   unsigned NumElts = VT.getVectorNumElements();
9306   SDLoc dl(BVOp);
9307 
9308   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
9309          "Unsupported vector type for broadcast.");
9310 
9311   // See if the build vector is a repeating sequence of scalars (inc. splat).
9312   SDValue Ld;
9313   BitVector UndefElements;
9314   SmallVector<SDValue, 16> Sequence;
9315   if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9316     assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
9317     if (Sequence.size() == 1)
9318       Ld = Sequence[0];
9319   }
9320 
9321   // Attempt to use VBROADCASTM
9322   // From this pattern:
9323   // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9324   // b. t1 = (build_vector t0 t0)
9325   //
9326   // Create (VBROADCASTM v2i1 X)
9327   if (!Sequence.empty() && Subtarget.hasCDI()) {
9328     // If not a splat, are the upper sequence values zeroable?
9329     unsigned SeqLen = Sequence.size();
9330     bool UpperZeroOrUndef =
9331         SeqLen == 1 ||
9332         llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
9333           return !V || V.isUndef() || isNullConstant(V);
9334         });
9335     SDValue Op0 = Sequence[0];
9336     if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9337                              (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9338                               Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9339       SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9340                              ? Op0.getOperand(0)
9341                              : Op0.getOperand(0).getOperand(0);
9342       MVT MaskVT = BOperand.getSimpleValueType();
9343       MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9344       if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) ||  // for broadcastmb2q
9345           (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9346         MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9347         if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9348           unsigned Scale = 512 / VT.getSizeInBits();
9349           BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9350         }
9351         SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9352         if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9353           Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9354         return DAG.getBitcast(VT, Bcst);
9355       }
9356     }
9357   }
9358 
9359   unsigned NumUndefElts = UndefElements.count();
9360   if (!Ld || (NumElts - NumUndefElts) <= 1) {
9361     APInt SplatValue, Undef;
9362     unsigned SplatBitSize;
9363     bool HasUndef;
9364     // Check if this is a repeated constant pattern suitable for broadcasting.
9365     if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9366         SplatBitSize > VT.getScalarSizeInBits() &&
9367         SplatBitSize < VT.getSizeInBits()) {
      // Avoid replacing with a broadcast when the value is used by a shuffle
      // instruction, to preserve the present custom lowering of shuffles.
9370       if (isFoldableUseOfShuffle(BVOp))
9371         return SDValue();
      // Replace the BUILD_VECTOR with a broadcast of the repeated constants.
9373       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9374       LLVMContext *Ctx = DAG.getContext();
9375       MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9376       if (Subtarget.hasAVX()) {
9377         if (SplatBitSize == 32 || SplatBitSize == 64 ||
9378             (SplatBitSize < 32 && Subtarget.hasAVX2())) {
          // The splatted value can fit in one INTEGER constant in the constant
          // pool. Load the constant and broadcast it.
9381           MVT CVT = MVT::getIntegerVT(SplatBitSize);
9382           Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
9383           Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
9384           SDValue CP = DAG.getConstantPool(C, PVT);
9385           unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9386 
9387           Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9388           SDVTList Tys =
9389               DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9390           SDValue Ops[] = {DAG.getEntryNode(), CP};
9391           MachinePointerInfo MPI =
9392               MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9393           SDValue Brdcst = DAG.getMemIntrinsicNode(
9394               X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
9395               MachineMemOperand::MOLoad);
9396           return DAG.getBitcast(VT, Brdcst);
9397         }
9398         if (SplatBitSize > 64) {
9399           // Load the vector of constants and broadcast it.
9400           Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
9401                                              *Ctx);
9402           SDValue VCP = DAG.getConstantPool(VecC, PVT);
9403           unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9404           MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9405           Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9406           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9407           SDValue Ops[] = {DAG.getEntryNode(), VCP};
9408           MachinePointerInfo MPI =
9409               MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9410           return DAG.getMemIntrinsicNode(
9411               X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
9412               MachineMemOperand::MOLoad);
9413         }
9414       }
9415     }
9416 
9417     // If we are moving a scalar into a vector (Ld must be set and all elements
9418     // but 1 are undef) and that operation is not obviously supported by
9419     // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9420     // That's better than general shuffling and may eliminate a load to GPR and
9421     // move from scalar to vector register.
9422     if (!Ld || NumElts - NumUndefElts != 1)
9423       return SDValue();
9424     unsigned ScalarSize = Ld.getValueSizeInBits();
9425     if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9426       return SDValue();
9427   }
9428 
9429   bool ConstSplatVal =
9430       (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9431   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9432 
9433   // TODO: Handle broadcasts of non-constant sequences.
9434 
9435   // Make sure that all of the users of a non-constant load are from the
9436   // BUILD_VECTOR node.
9437   // FIXME: Is the use count needed for non-constant, non-load case?
9438   if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9439     return SDValue();
9440 
9441   unsigned ScalarSize = Ld.getValueSizeInBits();
9442   bool IsGE256 = (VT.getSizeInBits() >= 256);
9443 
9444   // When optimizing for size, generate up to 5 extra bytes for a broadcast
9445   // instruction to save 8 or more bytes of constant pool data.
9446   // TODO: If multiple splats are generated to load the same constant,
9447   // it may be detrimental to overall size. There needs to be a way to detect
9448   // that condition to know if this is truly a size win.
9449   bool OptForSize = DAG.shouldOptForSize();
9450 
9451   // Handle broadcasting a single constant scalar from the constant pool
9452   // into a vector.
9453   // On Sandybridge (no AVX2), it is still better to load a constant vector
9454   // from the constant pool and not to broadcast it from a scalar.
9455   // But override that restriction when optimizing for size.
9456   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9457   if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9458     EVT CVT = Ld.getValueType();
9459     assert(!CVT.isVector() && "Must not broadcast a vector type");
9460 
9461     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
9462     // For size optimization, also splat v2f64 and v2i64, and for size opt
9463     // with AVX2, also splat i8 and i16.
9464     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9465     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9466         (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) ||
9467         (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9468       const Constant *C = nullptr;
9469       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9470         C = CI->getConstantIntValue();
9471       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9472         C = CF->getConstantFPValue();
9473 
9474       assert(C && "Invalid constant type");
9475 
9476       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9477       SDValue CP =
9478           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9479       Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9480 
9481       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9482       SDValue Ops[] = {DAG.getEntryNode(), CP};
9483       MachinePointerInfo MPI =
9484           MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9485       return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9486                                      MPI, Alignment, MachineMemOperand::MOLoad);
9487     }
9488   }
9489 
9490   // Handle AVX2 in-register broadcasts.
9491   if (!IsLoad && Subtarget.hasInt256() &&
9492       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9493     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9494 
9495   // The scalar source must be a normal load.
9496   if (!IsLoad)
9497     return SDValue();
9498 
9499   // Make sure the non-chain result is only used by this build vector.
9500   if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9501     return SDValue();
9502 
9503   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9504       (Subtarget.hasVLX() && ScalarSize == 64)) {
9505     auto *LN = cast<LoadSDNode>(Ld);
9506     SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9507     SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9508     SDValue BCast =
9509         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9510                                 LN->getMemoryVT(), LN->getMemOperand());
9511     DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9512     return BCast;
9513   }
9514 
  // The integer check is needed for the 64-bit into 128-bit case so it doesn't
  // match double, since there is no vbroadcastsd xmm.
9517   if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9518       (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9519     auto *LN = cast<LoadSDNode>(Ld);
9520     SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9521     SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9522     SDValue BCast =
9523         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9524                                 LN->getMemoryVT(), LN->getMemOperand());
9525     DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9526     return BCast;
9527   }
9528 
9529   if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
9530     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9531 
9532   // Unsupported broadcast.
9533   return SDValue();
9534 }
9535 
9536 /// For an EXTRACT_VECTOR_ELT with a constant index return the real
9537 /// underlying vector and index.
9538 ///
9539 /// Modifies \p ExtractedFromVec to the real vector and returns the real
9540 /// index.
9541 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9542                                          SDValue ExtIdx) {
9543   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9544   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9545     return Idx;
9546 
9547   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9548   // lowered this:
9549   //   (extract_vector_elt (v8f32 %1), Constant<6>)
9550   // to:
9551   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
9552   //                           (extract_subvector (v8f32 %0), Constant<4>),
9553   //                           undef)
9554   //                       Constant<0>)
9555   // In this case the vector is the extract_subvector expression and the index
9556   // is 2, as specified by the shuffle.
9557   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9558   SDValue ShuffleVec = SVOp->getOperand(0);
9559   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9560   assert(ShuffleVecVT.getVectorElementType() ==
9561          ExtractedFromVec.getSimpleValueType().getVectorElementType());
9562 
9563   int ShuffleIdx = SVOp->getMaskElt(Idx);
9564   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9565     ExtractedFromVec = ShuffleVec;
9566     return ShuffleIdx;
9567   }
9568   return Idx;
9569 }
9570 
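/// Attempt to lower a BUILD_VECTOR whose operands are mostly EXTRACT_VECTOR_ELTs
/// from at most two source vectors of the same type as a vector shuffle, with
/// INSERT_VECTOR_ELTs for the few remaining elements.
/// For example (illustrative):
///   (build_vector (extract_elt A, 0), (extract_elt A, 1), C, (extract_elt A, 3))
///     --> (insert_elt (vector_shuffle<0,1,u,3> A, undef), C, 2)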
9571 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9572   MVT VT = Op.getSimpleValueType();
9573 
9574   // Skip if insert_vec_elt is not supported.
9575   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9576   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9577     return SDValue();
9578 
9579   SDLoc DL(Op);
9580   unsigned NumElems = Op.getNumOperands();
9581 
9582   SDValue VecIn1;
9583   SDValue VecIn2;
9584   SmallVector<unsigned, 4> InsertIndices;
9585   SmallVector<int, 8> Mask(NumElems, -1);
9586 
9587   for (unsigned i = 0; i != NumElems; ++i) {
9588     unsigned Opc = Op.getOperand(i).getOpcode();
9589 
9590     if (Opc == ISD::UNDEF)
9591       continue;
9592 
9593     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if more than one element needs inserting.
9595       if (InsertIndices.size() > 1)
9596         return SDValue();
9597 
9598       InsertIndices.push_back(i);
9599       continue;
9600     }
9601 
9602     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
9603     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
9604 
9605     // Quit if non-constant index.
9606     if (!isa<ConstantSDNode>(ExtIdx))
9607       return SDValue();
9608     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
9609 
9610     // Quit if extracted from vector of different type.
9611     if (ExtractedFromVec.getValueType() != VT)
9612       return SDValue();
9613 
9614     if (!VecIn1.getNode())
9615       VecIn1 = ExtractedFromVec;
9616     else if (VecIn1 != ExtractedFromVec) {
9617       if (!VecIn2.getNode())
9618         VecIn2 = ExtractedFromVec;
9619       else if (VecIn2 != ExtractedFromVec)
        // Quit if there are more than 2 vectors to shuffle.
9621         return SDValue();
9622     }
9623 
9624     if (ExtractedFromVec == VecIn1)
9625       Mask[i] = Idx;
9626     else if (ExtractedFromVec == VecIn2)
9627       Mask[i] = Idx + NumElems;
9628   }
9629 
9630   if (!VecIn1.getNode())
9631     return SDValue();
9632 
9633   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
9634   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
9635 
9636   for (unsigned Idx : InsertIndices)
9637     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
9638                      DAG.getIntPtrConstant(Idx, DL));
9639 
9640   return NV;
9641 }
9642 
// Lower BUILD_VECTOR operations for vXi1 mask types.
9644 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
9645                                      const X86Subtarget &Subtarget) {
9646 
9647   MVT VT = Op.getSimpleValueType();
9648   assert((VT.getVectorElementType() == MVT::i1) &&
9649          "Unexpected type in LowerBUILD_VECTORvXi1!");
9650 
9651   SDLoc dl(Op);
9652   if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
9653       ISD::isBuildVectorAllOnes(Op.getNode()))
9654     return Op;
9655 
9656   uint64_t Immediate = 0;
9657   SmallVector<unsigned, 16> NonConstIdx;
9658   bool IsSplat = true;
9659   bool HasConstElts = false;
9660   int SplatIdx = -1;
9661   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
9662     SDValue In = Op.getOperand(idx);
9663     if (In.isUndef())
9664       continue;
9665     if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
9666       Immediate |= (InC->getZExtValue() & 0x1) << idx;
9667       HasConstElts = true;
9668     } else {
9669       NonConstIdx.push_back(idx);
9670     }
9671     if (SplatIdx < 0)
9672       SplatIdx = idx;
9673     else if (In != Op.getOperand(SplatIdx))
9674       IsSplat = false;
9675   }
9676 
  // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
9678   if (IsSplat) {
9679     // The build_vector allows the scalar element to be larger than the vector
9680     // element type. We need to mask it to use as a condition unless we know
9681     // the upper bits are zero.
9682     // FIXME: Use computeKnownBits instead of checking specific opcode?
9683     SDValue Cond = Op.getOperand(SplatIdx);
9684     assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
9685     if (Cond.getOpcode() != ISD::SETCC)
9686       Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
9687                          DAG.getConstant(1, dl, MVT::i8));
9688 
9689     // Perform the select in the scalar domain so we can use cmov.
9690     if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9691       SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
9692                                      DAG.getAllOnesConstant(dl, MVT::i32),
9693                                      DAG.getConstant(0, dl, MVT::i32));
9694       Select = DAG.getBitcast(MVT::v32i1, Select);
9695       return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
9696     } else {
9697       MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9698       SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
9699                                      DAG.getAllOnesConstant(dl, ImmVT),
9700                                      DAG.getConstant(0, dl, ImmVT));
9701       MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9702       Select = DAG.getBitcast(VecVT, Select);
9703       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
9704                          DAG.getIntPtrConstant(0, dl));
9705     }
9706   }
9707 
  // Insert the non-constant elements one by one.
9709   SDValue DstVec;
9710   if (HasConstElts) {
9711     if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9712       SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
9713       SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
9714       ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
9715       ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
9716       DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
9717     } else {
9718       MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9719       SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
9720       MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9721       DstVec = DAG.getBitcast(VecVT, Imm);
9722       DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
9723                            DAG.getIntPtrConstant(0, dl));
9724     }
9725   } else
9726     DstVec = DAG.getUNDEF(VT);
9727 
9728   for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
9729     unsigned InsertIdx = NonConstIdx[i];
9730     DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
9731                          Op.getOperand(InsertIdx),
9732                          DAG.getIntPtrConstant(InsertIdx, dl));
9733   }
9734   return DstVec;
9735 }
9736 
9737 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
9738   switch (Opcode) {
9739   case X86ISD::PACKSS:
9740   case X86ISD::PACKUS:
9741   case X86ISD::FHADD:
9742   case X86ISD::FHSUB:
9743   case X86ISD::HADD:
9744   case X86ISD::HSUB:
9745     return true;
9746   }
9747   return false;
9748 }
9749 
9750 /// This is a helper function of LowerToHorizontalOp().
/// This function checks whether the input build_vector \p N implements a
/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
/// may not match the layout of an x86 256-bit horizontal instruction.
9754 /// In other words, if this returns true, then some extraction/insertion will
9755 /// be required to produce a valid horizontal instruction.
9756 ///
9757 /// Parameter \p Opcode defines the kind of horizontal operation to match.
9758 /// For example, if \p Opcode is equal to ISD::ADD, then this function
9759 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
9760 /// is equal to ISD::SUB, then this function checks if this is a horizontal
9761 /// arithmetic sub.
9762 ///
9763 /// This function only analyzes elements of \p N whose indices are
9764 /// in range [BaseIdx, LastIdx).
9765 ///
9766 /// TODO: This function was originally used to match both real and fake partial
9767 /// horizontal operations, but the index-matching logic is incorrect for that.
9768 /// See the corrected implementation in isHopBuildVector(). Can we reduce this
9769 /// code because it is only used for partial h-op matching now?
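/// For example (illustrative), with \p Opcode == ISD::ADD, BaseIdx == 0 and
/// LastIdx == 4, a build_vector whose first operands are
///   (add (extract_vector_elt A, 0), (extract_vector_elt A, 1)) and
///   (add (extract_vector_elt A, 2), (extract_vector_elt A, 3))
/// is matched with V0 == A.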
9770 static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
9771                                   SelectionDAG &DAG,
9772                                   unsigned BaseIdx, unsigned LastIdx,
9773                                   SDValue &V0, SDValue &V1) {
9774   EVT VT = N->getValueType(0);
9775   assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
9776   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
9777   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
9778          "Invalid Vector in input!");
9779 
9780   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
9781   bool CanFold = true;
9782   unsigned ExpectedVExtractIdx = BaseIdx;
9783   unsigned NumElts = LastIdx - BaseIdx;
9784   V0 = DAG.getUNDEF(VT);
9785   V1 = DAG.getUNDEF(VT);
9786 
9787   // Check if N implements a horizontal binop.
9788   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
9789     SDValue Op = N->getOperand(i + BaseIdx);
9790 
9791     // Skip UNDEFs.
9792     if (Op->isUndef()) {
9793       // Update the expected vector extract index.
9794       if (i * 2 == NumElts)
9795         ExpectedVExtractIdx = BaseIdx;
9796       ExpectedVExtractIdx += 2;
9797       continue;
9798     }
9799 
9800     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
9801 
9802     if (!CanFold)
9803       break;
9804 
9805     SDValue Op0 = Op.getOperand(0);
9806     SDValue Op1 = Op.getOperand(1);
9807 
9808     // Try to match the following pattern:
9809     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
9810     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9811         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9812         Op0.getOperand(0) == Op1.getOperand(0) &&
9813         isa<ConstantSDNode>(Op0.getOperand(1)) &&
9814         isa<ConstantSDNode>(Op1.getOperand(1)));
9815     if (!CanFold)
9816       break;
9817 
9818     unsigned I0 = Op0.getConstantOperandVal(1);
9819     unsigned I1 = Op1.getConstantOperandVal(1);
9820 
9821     if (i * 2 < NumElts) {
9822       if (V0.isUndef()) {
9823         V0 = Op0.getOperand(0);
9824         if (V0.getValueType() != VT)
9825           return false;
9826       }
9827     } else {
9828       if (V1.isUndef()) {
9829         V1 = Op0.getOperand(0);
9830         if (V1.getValueType() != VT)
9831           return false;
9832       }
9833       if (i * 2 == NumElts)
9834         ExpectedVExtractIdx = BaseIdx;
9835     }
9836 
9837     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
9838     if (I0 == ExpectedVExtractIdx)
9839       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
9840     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
9841       // Try to match the following dag sequence:
9842       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
9843       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
9844     } else
9845       CanFold = false;
9846 
9847     ExpectedVExtractIdx += 2;
9848   }
9849 
9850   return CanFold;
9851 }
9852 
9853 /// Emit a sequence of two 128-bit horizontal add/sub followed by
9854 /// a concat_vector.
9855 ///
9856 /// This is a helper function of LowerToHorizontalOp().
9857 /// This function expects two 256-bit vectors called V0 and V1.
9858 /// At first, each vector is split into two separate 128-bit vectors.
9859 /// Then, the resulting 128-bit vectors are used to implement two
9860 /// horizontal binary operations.
9861 ///
9862 /// The kind of horizontal binary operation is defined by \p X86Opcode.
9863 ///
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as inputs
/// to the two new horizontal binops.
/// When Mode is set, the first horizontal binop dag node takes as inputs the
/// lower 128 bits and the upper 128 bits of V0, and the second horizontal
/// binop dag node takes as inputs the lower 128 bits and the upper 128 bits
/// of V1.
9870 ///   Example:
9871 ///     HADD V0_LO, V0_HI
9872 ///     HADD V1_LO, V1_HI
9873 ///
/// Otherwise, the first horizontal binop dag node takes as inputs the lower
/// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
/// binop dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
9877 ///   Example:
9878 ///     HADD V0_LO, V1_LO
9879 ///     HADD V0_HI, V1_HI
9880 ///
/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
/// 128 bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
/// the upper 128 bits of the result.
9884 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
9885                                      const SDLoc &DL, SelectionDAG &DAG,
9886                                      unsigned X86Opcode, bool Mode,
9887                                      bool isUndefLO, bool isUndefHI) {
9888   MVT VT = V0.getSimpleValueType();
9889   assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
9890          "Invalid nodes in input!");
9891 
9892   unsigned NumElts = VT.getVectorNumElements();
9893   SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
9894   SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
9895   SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
9896   SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
9897   MVT NewVT = V0_LO.getSimpleValueType();
9898 
9899   SDValue LO = DAG.getUNDEF(NewVT);
9900   SDValue HI = DAG.getUNDEF(NewVT);
9901 
9902   if (Mode) {
9903     // Don't emit a horizontal binop if the result is expected to be UNDEF.
9904     if (!isUndefLO && !V0->isUndef())
9905       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
9906     if (!isUndefHI && !V1->isUndef())
9907       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
9908   } else {
9909     // Don't emit a horizontal binop if the result is expected to be UNDEF.
9910     if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
9911       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
9912 
9913     if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
9914       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
9915   }
9916 
9917   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
9918 }
9919 
/// Returns true iff \p BV builds a vector whose result is equivalent to the
/// result of an ADDSUB/SUBADD operation.
/// If true is returned, then the operands of the ADDSUB = Opnd0 +- Opnd1
/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
/// \p Opnd0 and \p Opnd1.
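///
/// For example, a v4f32 build_vector whose elements are
///   (fsub (extract_vector_elt A, i), (extract_vector_elt B, i)) for even i and
///   (fadd (extract_vector_elt A, i), (extract_vector_elt B, i)) for odd i
/// is recognized as ADDSUB(A, B), with A and B returned in \p Opnd0 and
/// \p Opnd1 and \p IsSubAdd set to false.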
9925 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
9926                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
9927                              SDValue &Opnd0, SDValue &Opnd1,
9928                              unsigned &NumExtracts,
9929                              bool &IsSubAdd) {
9930 
9931   MVT VT = BV->getSimpleValueType(0);
9932   if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
9933     return false;
9934 
9935   unsigned NumElts = VT.getVectorNumElements();
9936   SDValue InVec0 = DAG.getUNDEF(VT);
9937   SDValue InVec1 = DAG.getUNDEF(VT);
9938 
9939   NumExtracts = 0;
9940 
9941   // Odd-numbered elements in the input build vector are obtained from
9942   // adding/subtracting two integer/float elements.
9943   // Even-numbered elements in the input build vector are obtained from
9944   // subtracting/adding two integer/float elements.
9945   unsigned Opc[2] = {0, 0};
9946   for (unsigned i = 0, e = NumElts; i != e; ++i) {
9947     SDValue Op = BV->getOperand(i);
9948 
9949     // Skip 'undef' values.
9950     unsigned Opcode = Op.getOpcode();
9951     if (Opcode == ISD::UNDEF)
9952       continue;
9953 
9954     // Early exit if we found an unexpected opcode.
9955     if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
9956       return false;
9957 
9958     SDValue Op0 = Op.getOperand(0);
9959     SDValue Op1 = Op.getOperand(1);
9960 
9961     // Try to match the following pattern:
9962     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
9963     // Early exit if we cannot match that sequence.
9964     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9965         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9966         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9967         Op0.getOperand(1) != Op1.getOperand(1))
9968       return false;
9969 
9970     unsigned I0 = Op0.getConstantOperandVal(1);
9971     if (I0 != i)
9972       return false;
9973 
    // We found a valid add/sub node; make sure it's the same opcode as previous
    // elements for this parity.
9976     if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
9977       return false;
9978     Opc[i % 2] = Opcode;
9979 
9980     // Update InVec0 and InVec1.
9981     if (InVec0.isUndef()) {
9982       InVec0 = Op0.getOperand(0);
9983       if (InVec0.getSimpleValueType() != VT)
9984         return false;
9985     }
9986     if (InVec1.isUndef()) {
9987       InVec1 = Op1.getOperand(0);
9988       if (InVec1.getSimpleValueType() != VT)
9989         return false;
9990     }
9991 
    // Make sure that the operands of each add/sub node always come from the
    // same pair of vectors.
9994     if (InVec0 != Op0.getOperand(0)) {
9995       if (Opcode == ISD::FSUB)
9996         return false;
9997 
9998       // FADD is commutable. Try to commute the operands
9999       // and then test again.
10000       std::swap(Op0, Op1);
10001       if (InVec0 != Op0.getOperand(0))
10002         return false;
10003     }
10004 
10005     if (InVec1 != Op1.getOperand(0))
10006       return false;
10007 
10008     // Increment the number of extractions done.
10009     ++NumExtracts;
10010   }
10011 
10012   // Ensure we have found an opcode for both parities and that they are
10013   // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
10014   // inputs are undef.
10015   if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
10016       InVec0.isUndef() || InVec1.isUndef())
10017     return false;
10018 
10019   IsSubAdd = Opc[0] == ISD::FADD;
10020 
10021   Opnd0 = InVec0;
10022   Opnd1 = InVec1;
10023   return true;
10024 }
10025 
/// Returns true if it is possible to fold MUL and an idiom that has already
/// been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
/// FMADDSUB/FMSUBADD(x, y, \p Opnd2). If (and only if) true is returned, the
/// operands of FMADDSUB/FMSUBADD are written to the parameters \p Opnd0,
/// \p Opnd1 and \p Opnd2.
10030 ///
10031 /// Prior to calling this function it should be known that there is some
10032 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
10033 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
10034 /// before replacement of such SDNode with ADDSUB operation. Thus the number
10035 /// of \p Opnd0 uses is expected to be equal to 2.
10036 /// For example, this function may be called for the following IR:
10037 ///    %AB = fmul fast <2 x double> %A, %B
10038 ///    %Sub = fsub fast <2 x double> %AB, %C
10039 ///    %Add = fadd fast <2 x double> %AB, %C
10040 ///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
10041 ///                            <2 x i32> <i32 0, i32 3>
10042 /// There is a def for %Addsub here, which potentially can be replaced by
10043 /// X86ISD::ADDSUB operation:
10044 ///    %Addsub = X86ISD::ADDSUB %AB, %C
10045 /// and such ADDSUB can further be replaced with FMADDSUB:
10046 ///    %Addsub = FMADDSUB %A, %B, %C.
10047 ///
10048 /// The main reason why this method is called before the replacement of the
10049 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
10050 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
10051 /// FMADDSUB is.
10052 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
10053                                  SelectionDAG &DAG,
10054                                  SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
10055                                  unsigned ExpectedUses) {
10056   if (Opnd0.getOpcode() != ISD::FMUL ||
10057       !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
10058     return false;
10059 
10060   // FIXME: These checks must match the similar ones in
10061   // DAGCombiner::visitFADDForFMACombine. It would be good to have one
10062   // function that would answer if it is Ok to fuse MUL + ADD to FMADD
10063   // or MUL + ADDSUB to FMADDSUB.
10064   const TargetOptions &Options = DAG.getTarget().Options;
10065   bool AllowFusion =
10066       (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
10067   if (!AllowFusion)
10068     return false;
10069 
10070   Opnd2 = Opnd1;
10071   Opnd1 = Opnd0.getOperand(1);
10072   Opnd0 = Opnd0.getOperand(0);
10073 
10074   return true;
10075 }
10076 
/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
/// 'fmsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
/// X86ISD::FMSUBADD node.
10080 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
10081                                        const X86Subtarget &Subtarget,
10082                                        SelectionDAG &DAG) {
10083   SDValue Opnd0, Opnd1;
10084   unsigned NumExtracts;
10085   bool IsSubAdd;
10086   if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
10087                         IsSubAdd))
10088     return SDValue();
10089 
10090   MVT VT = BV->getSimpleValueType(0);
10091   SDLoc DL(BV);
10092 
10093   // Try to generate X86ISD::FMADDSUB node here.
10094   SDValue Opnd2;
10095   if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
10096     unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
10097     return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
10098   }
10099 
10100   // We only support ADDSUB.
10101   if (IsSubAdd)
10102     return SDValue();
10103 
10104   // There are no known X86 targets with 512-bit ADDSUB instructions!
10105   // Convert to blend(fsub,fadd).
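  // For example, for v8f64 the mask built below is <0, 9, 2, 11, 4, 13, 6, 15>,
  // taking even lanes from the FSUB and odd lanes from the FADD.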
10106   if (VT.is512BitVector()) {
10107     SmallVector<int> Mask;
10108     for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
      Mask.push_back(I);
      Mask.push_back(I + E + 1);
10111     }
10112     SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
10113     SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
10114     return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
10115   }
10116 
10117   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
10118 }
10119 
10120 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
10121                              unsigned &HOpcode, SDValue &V0, SDValue &V1) {
10122   // Initialize outputs to known values.
10123   MVT VT = BV->getSimpleValueType(0);
10124   HOpcode = ISD::DELETED_NODE;
10125   V0 = DAG.getUNDEF(VT);
10126   V1 = DAG.getUNDEF(VT);
10127 
10128   // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
10129   // half of the result is calculated independently from the 128-bit halves of
10130   // the inputs, so that makes the index-checking logic below more complicated.
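  // For example, a 256-bit v8i32 HADD of sources A and B produces:
  //   <A0+A1, A2+A3, B0+B1, B2+B3, A4+A5, A6+A7, B4+B5, B6+B7>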
10131   unsigned NumElts = VT.getVectorNumElements();
10132   unsigned GenericOpcode = ISD::DELETED_NODE;
10133   unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
10134   unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
10135   unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
10136   for (unsigned i = 0; i != Num128BitChunks; ++i) {
10137     for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
10138       // Ignore undef elements.
10139       SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
10140       if (Op.isUndef())
10141         continue;
10142 
10143       // If there's an opcode mismatch, we're done.
10144       if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
10145         return false;
10146 
10147       // Initialize horizontal opcode.
10148       if (HOpcode == ISD::DELETED_NODE) {
10149         GenericOpcode = Op.getOpcode();
10150         switch (GenericOpcode) {
10151         case ISD::ADD: HOpcode = X86ISD::HADD; break;
10152         case ISD::SUB: HOpcode = X86ISD::HSUB; break;
10153         case ISD::FADD: HOpcode = X86ISD::FHADD; break;
10154         case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
10155         default: return false;
10156         }
10157       }
10158 
10159       SDValue Op0 = Op.getOperand(0);
10160       SDValue Op1 = Op.getOperand(1);
10161       if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10162           Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10163           Op0.getOperand(0) != Op1.getOperand(0) ||
10164           !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10165           !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
10166         return false;
10167 
10168       // The source vector is chosen based on which 64-bit half of the
10169       // destination vector is being calculated.
10170       if (j < NumEltsIn64Bits) {
10171         if (V0.isUndef())
10172           V0 = Op0.getOperand(0);
10173       } else {
10174         if (V1.isUndef())
10175           V1 = Op0.getOperand(0);
10176       }
10177 
10178       SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
10179       if (SourceVec != Op0.getOperand(0))
10180         return false;
10181 
10182       // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
10183       unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
10184       unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
10185       unsigned ExpectedIndex = i * NumEltsIn128Bits +
10186                                (j % NumEltsIn64Bits) * 2;
10187       if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
10188         continue;
10189 
10190       // If this is not a commutative op, this does not match.
10191       if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
10192         return false;
10193 
10194       // Addition is commutative, so try swapping the extract indexes.
10195       // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
10196       if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
10197         continue;
10198 
10199       // Extract indexes do not match horizontal requirement.
10200       return false;
10201     }
10202   }
10203   // We matched. Opcode and operands are returned by reference as arguments.
10204   return true;
10205 }
10206 
10207 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
10208                                     SelectionDAG &DAG, unsigned HOpcode,
10209                                     SDValue V0, SDValue V1) {
10210   // If either input vector is not the same size as the build vector,
10211   // extract/insert the low bits to the correct size.
10212   // This is free (examples: zmm --> xmm, xmm --> ymm).
10213   MVT VT = BV->getSimpleValueType(0);
10214   unsigned Width = VT.getSizeInBits();
10215   if (V0.getValueSizeInBits() > Width)
10216     V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
10217   else if (V0.getValueSizeInBits() < Width)
10218     V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
10219 
10220   if (V1.getValueSizeInBits() > Width)
10221     V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
10222   else if (V1.getValueSizeInBits() < Width)
10223     V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
10224 
10225   unsigned NumElts = VT.getVectorNumElements();
10226   APInt DemandedElts = APInt::getAllOnes(NumElts);
10227   for (unsigned i = 0; i != NumElts; ++i)
10228     if (BV->getOperand(i).isUndef())
10229       DemandedElts.clearBit(i);
10230 
10231   // If we don't need the upper xmm, then perform as a xmm hop.
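  // For example, if the upper half of a v8i32 build_vector is all undef, a
  // 128-bit hop of the low halves of V0/V1 widened with undef upper bits is
  // enough.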
10232   unsigned HalfNumElts = NumElts / 2;
10233   if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10234     MVT HalfVT = VT.getHalfNumVectorElementsVT();
10235     V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10236     V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10237     SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10238     return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10239   }
10240 
10241   return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10242 }
10243 
10244 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10245 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10246                                    const X86Subtarget &Subtarget,
10247                                    SelectionDAG &DAG) {
10248   // We need at least 2 non-undef elements to make this worthwhile by default.
10249   unsigned NumNonUndefs =
10250       count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10251   if (NumNonUndefs < 2)
10252     return SDValue();
10253 
10254   // There are 4 sets of horizontal math operations distinguished by type:
10255   // int/FP at 128-bit/256-bit. Each type was introduced with a different
10256   // subtarget feature. Try to match those "native" patterns first.
10257   MVT VT = BV->getSimpleValueType(0);
10258   if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10259       ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10260       ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10261       ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10262     unsigned HOpcode;
10263     SDValue V0, V1;
10264     if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10265       return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10266   }
10267 
10268   // Try harder to match 256-bit ops by using extract/concat.
10269   if (!Subtarget.hasAVX() || !VT.is256BitVector())
10270     return SDValue();
10271 
  // Count the number of UNDEF operands in the input build_vector.
10273   unsigned NumElts = VT.getVectorNumElements();
10274   unsigned Half = NumElts / 2;
10275   unsigned NumUndefsLO = 0;
10276   unsigned NumUndefsHI = 0;
10277   for (unsigned i = 0, e = Half; i != e; ++i)
10278     if (BV->getOperand(i)->isUndef())
10279       NumUndefsLO++;
10280 
10281   for (unsigned i = Half, e = NumElts; i != e; ++i)
10282     if (BV->getOperand(i)->isUndef())
10283       NumUndefsHI++;
10284 
10285   SDLoc DL(BV);
10286   SDValue InVec0, InVec1;
10287   if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10288     SDValue InVec2, InVec3;
10289     unsigned X86Opcode;
10290     bool CanFold = true;
10291 
10292     if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10293         isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10294                               InVec3) &&
10295         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10296         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10297       X86Opcode = X86ISD::HADD;
10298     else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10299                                    InVec1) &&
10300              isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10301                                    InVec3) &&
10302              ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10303              ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10304       X86Opcode = X86ISD::HSUB;
10305     else
10306       CanFold = false;
10307 
10308     if (CanFold) {
10309       // Do not try to expand this build_vector into a pair of horizontal
10310       // add/sub if we can emit a pair of scalar add/sub.
10311       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10312         return SDValue();
10313 
10314       // Convert this build_vector into a pair of horizontal binops followed by
10315       // a concat vector. We must adjust the outputs from the partial horizontal
10316       // matching calls above to account for undefined vector halves.
10317       SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10318       SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10319       assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10320       bool isUndefLO = NumUndefsLO == Half;
10321       bool isUndefHI = NumUndefsHI == Half;
10322       return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10323                                    isUndefHI);
10324     }
10325   }
10326 
10327   if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10328       VT == MVT::v16i16) {
10329     unsigned X86Opcode;
10330     if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10331       X86Opcode = X86ISD::HADD;
10332     else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10333                                    InVec1))
10334       X86Opcode = X86ISD::HSUB;
10335     else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10336                                    InVec1))
10337       X86Opcode = X86ISD::FHADD;
10338     else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10339                                    InVec1))
10340       X86Opcode = X86ISD::FHSUB;
10341     else
10342       return SDValue();
10343 
10344     // Don't try to expand this build_vector into a pair of horizontal add/sub
10345     // if we can simply emit a pair of scalar add/sub.
10346     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10347       return SDValue();
10348 
10349     // Convert this build_vector into two horizontal add/sub followed by
10350     // a concat vector.
10351     bool isUndefLO = NumUndefsLO == Half;
10352     bool isUndefHI = NumUndefsHI == Half;
10353     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10354                                  isUndefLO, isUndefHI);
10355   }
10356 
10357   return SDValue();
10358 }
10359 
10360 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10361                           SelectionDAG &DAG);
10362 
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
/// just apply the bit operation to those vectors.
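/// For example:
///   (build_vector (and x0, 1), (and x1, 2), (and x2, 4), (and x3, 8))
/// becomes
///   (and (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8))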
/// NOTE: It's not in our interest to start making a general-purpose vectorizer
/// from this, but enough scalar bit operations are created by the later
/// legalization + scalarization stages to need basic support.
10369 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10370                                        const X86Subtarget &Subtarget,
10371                                        SelectionDAG &DAG) {
10372   SDLoc DL(Op);
10373   MVT VT = Op->getSimpleValueType(0);
10374   unsigned NumElems = VT.getVectorNumElements();
10375   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10376 
10377   // Check that all elements have the same opcode.
10378   // TODO: Should we allow UNDEFS and if so how many?
10379   unsigned Opcode = Op->getOperand(0).getOpcode();
10380   for (unsigned i = 1; i < NumElems; ++i)
10381     if (Opcode != Op->getOperand(i).getOpcode())
10382       return SDValue();
10383 
10384   // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10385   bool IsShift = false;
10386   switch (Opcode) {
10387   default:
10388     return SDValue();
10389   case ISD::SHL:
10390   case ISD::SRL:
10391   case ISD::SRA:
10392     IsShift = true;
10393     break;
10394   case ISD::AND:
10395   case ISD::XOR:
10396   case ISD::OR:
10397     // Don't do this if the buildvector is a splat - we'd replace one
10398     // constant with an entire vector.
10399     if (Op->getSplatValue())
10400       return SDValue();
10401     if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10402       return SDValue();
10403     break;
10404   }
10405 
10406   SmallVector<SDValue, 4> LHSElts, RHSElts;
10407   for (SDValue Elt : Op->ops()) {
10408     SDValue LHS = Elt.getOperand(0);
10409     SDValue RHS = Elt.getOperand(1);
10410 
10411     // We expect the canonicalized RHS operand to be the constant.
10412     if (!isa<ConstantSDNode>(RHS))
10413       return SDValue();
10414 
10415     // Extend shift amounts.
10416     if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10417       if (!IsShift)
10418         return SDValue();
10419       RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10420     }
10421 
10422     LHSElts.push_back(LHS);
10423     RHSElts.push_back(RHS);
10424   }
10425 
10426   // Limit to shifts by uniform immediates.
10427   // TODO: Only accept vXi8/vXi64 special cases?
10428   // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10429   if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10430     return SDValue();
10431 
10432   SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10433   SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10434   SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10435 
10436   if (!IsShift)
10437     return Res;
10438 
10439   // Immediately lower the shift to ensure the constant build vector doesn't
10440   // get converted to a constant pool before the shift is lowered.
10441   return LowerShift(Res, Subtarget, DAG);
10442 }
10443 
10444 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
10445 /// functionality to do this, so it's all zeros, all ones, or some derivation
10446 /// that is cheap to calculate.
10447 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10448                                          const X86Subtarget &Subtarget) {
10449   SDLoc DL(Op);
10450   MVT VT = Op.getSimpleValueType();
10451 
10452   // Vectors containing all zeros can be matched by pxor and xorps.
10453   if (ISD::isBuildVectorAllZeros(Op.getNode()))
10454     return Op;
10455 
10456   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10457   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10458   // vpcmpeqd on 256-bit vectors.
10459   if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10460     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10461       return Op;
10462 
10463     return getOnesVector(VT, DAG, DL);
10464   }
10465 
10466   return SDValue();
10467 }
10468 
10469 /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10470 /// from a vector of source values and a vector of extraction indices.
10471 /// The vectors might be manipulated to match the type of the permute op.
10472 static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10473                                      SDLoc &DL, SelectionDAG &DAG,
10474                                      const X86Subtarget &Subtarget) {
10475   MVT ShuffleVT = VT;
10476   EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10477   unsigned NumElts = VT.getVectorNumElements();
10478   unsigned SizeInBits = VT.getSizeInBits();
10479 
10480   // Adjust IndicesVec to match VT size.
10481   assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10482          "Illegal variable permute mask size");
10483   if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10484     // Narrow/widen the indices vector to the correct size.
10485     if (IndicesVec.getValueSizeInBits() > SizeInBits)
10486       IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10487                                     NumElts * VT.getScalarSizeInBits());
10488     else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10489       IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10490                                   SDLoc(IndicesVec), SizeInBits);
10491     // Zero-extend the index elements within the vector.
10492     if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10493       IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10494                                IndicesVT, IndicesVec);
10495   }
10496   IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10497 
  // Handle a SrcVec whose type doesn't match VT.
10499   if (SrcVec.getValueSizeInBits() != SizeInBits) {
10500     if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10501       // Handle larger SrcVec by treating it as a larger permute.
10502       unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10503       VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10504       IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10505       IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10506                                   Subtarget, DAG, SDLoc(IndicesVec));
10507       SDValue NewSrcVec =
10508           createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10509       if (NewSrcVec)
10510         return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10511       return SDValue();
10512     } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10513       // Widen smaller SrcVec to match VT.
10514       SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10515     } else
10516       return SDValue();
10517   }
10518 
10519   auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10520     assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10521     EVT SrcVT = Idx.getValueType();
10522     unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10523     uint64_t IndexScale = 0;
10524     uint64_t IndexOffset = 0;
10525 
10526     // If we're scaling a smaller permute op, then we need to repeat the
10527     // indices, scaling and offsetting them as well.
10528     // e.g. v4i32 -> v16i8 (Scale = 4)
10529     // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10530     // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10531     for (uint64_t i = 0; i != Scale; ++i) {
10532       IndexScale |= Scale << (i * NumDstBits);
10533       IndexOffset |= i << (i * NumDstBits);
10534     }
10535 
10536     Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10537                       DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10538     Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10539                       DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10540     return Idx;
10541   };
10542 
10543   unsigned Opcode = 0;
10544   switch (VT.SimpleTy) {
10545   default:
10546     break;
10547   case MVT::v16i8:
10548     if (Subtarget.hasSSSE3())
10549       Opcode = X86ISD::PSHUFB;
10550     break;
10551   case MVT::v8i16:
10552     if (Subtarget.hasVLX() && Subtarget.hasBWI())
10553       Opcode = X86ISD::VPERMV;
10554     else if (Subtarget.hasSSSE3()) {
10555       Opcode = X86ISD::PSHUFB;
10556       ShuffleVT = MVT::v16i8;
10557     }
10558     break;
10559   case MVT::v4f32:
10560   case MVT::v4i32:
10561     if (Subtarget.hasAVX()) {
10562       Opcode = X86ISD::VPERMILPV;
10563       ShuffleVT = MVT::v4f32;
10564     } else if (Subtarget.hasSSSE3()) {
10565       Opcode = X86ISD::PSHUFB;
10566       ShuffleVT = MVT::v16i8;
10567     }
10568     break;
10569   case MVT::v2f64:
10570   case MVT::v2i64:
10571     if (Subtarget.hasAVX()) {
10572       // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
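      // Doubling each index moves its value into bit#1 (0 -> 0, 1 -> 2).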
10573       IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10574       Opcode = X86ISD::VPERMILPV;
10575       ShuffleVT = MVT::v2f64;
10576     } else if (Subtarget.hasSSE41()) {
10577       // SSE41 can compare v2i64 - select between indices 0 and 1.
10578       return DAG.getSelectCC(
10579           DL, IndicesVec,
10580           getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
10581           DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
10582           DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
10583           ISD::CondCode::SETEQ);
10584     }
10585     break;
10586   case MVT::v32i8:
10587     if (Subtarget.hasVLX() && Subtarget.hasVBMI())
10588       Opcode = X86ISD::VPERMV;
10589     else if (Subtarget.hasXOP()) {
10590       SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
10591       SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
10592       SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
10593       SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
10594       return DAG.getNode(
10595           ISD::CONCAT_VECTORS, DL, VT,
10596           DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
10597           DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
10598     } else if (Subtarget.hasAVX()) {
10599       SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
10600       SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
10601       SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
10602       SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
10603       auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
10604                               ArrayRef<SDValue> Ops) {
10605         // Permute Lo and Hi and then select based on index range.
        // This works as PSHUFB uses bits[3:0] to permute elements and we don't
        // care about bit[7] as it's just an index vector.
10608         SDValue Idx = Ops[2];
10609         EVT VT = Idx.getValueType();
10610         return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
10611                                DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
10612                                DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
10613                                ISD::CondCode::SETGT);
10614       };
10615       SDValue Ops[] = {LoLo, HiHi, IndicesVec};
10616       return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
10617                               PSHUFBBuilder);
10618     }
10619     break;
10620   case MVT::v16i16:
10621     if (Subtarget.hasVLX() && Subtarget.hasBWI())
10622       Opcode = X86ISD::VPERMV;
10623     else if (Subtarget.hasAVX()) {
10624       // Scale to v32i8 and perform as v32i8.
10625       IndicesVec = ScaleIndices(IndicesVec, 2);
10626       return DAG.getBitcast(
10627           VT, createVariablePermute(
10628                   MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
10629                   DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
10630     }
10631     break;
10632   case MVT::v8f32:
10633   case MVT::v8i32:
10634     if (Subtarget.hasAVX2())
10635       Opcode = X86ISD::VPERMV;
10636     else if (Subtarget.hasAVX()) {
10637       SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
10638       SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10639                                           {0, 1, 2, 3, 0, 1, 2, 3});
10640       SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10641                                           {4, 5, 6, 7, 4, 5, 6, 7});
10642       if (Subtarget.hasXOP())
10643         return DAG.getBitcast(
10644             VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
10645                             IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10646       // Permute Lo and Hi and then select based on index range.
10647       // This works as VPERMILPS only uses index bits[0:1] to permute elements.
10648       SDValue Res = DAG.getSelectCC(
10649           DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
10650           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
10651           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
10652           ISD::CondCode::SETGT);
10653       return DAG.getBitcast(VT, Res);
10654     }
10655     break;
10656   case MVT::v4i64:
10657   case MVT::v4f64:
10658     if (Subtarget.hasAVX512()) {
10659       if (!Subtarget.hasVLX()) {
10660         MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
10661         SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
10662                                 SDLoc(SrcVec));
10663         IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
10664                                     DAG, SDLoc(IndicesVec));
10665         SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
10666                                             DAG, Subtarget);
10667         return extract256BitVector(Res, 0, DAG, DL);
10668       }
10669       Opcode = X86ISD::VPERMV;
10670     } else if (Subtarget.hasAVX()) {
10671       SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
10672       SDValue LoLo =
10673           DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
10674       SDValue HiHi =
10675           DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
10676       // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
10677       IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10678       if (Subtarget.hasXOP())
10679         return DAG.getBitcast(
10680             VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
10681                             IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10682       // Permute Lo and Hi and then select based on index range.
10683       // This works as VPERMILPD only uses index bit[1] to permute elements.
10684       SDValue Res = DAG.getSelectCC(
10685           DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
10686           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
10687           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
10688           ISD::CondCode::SETGT);
10689       return DAG.getBitcast(VT, Res);
10690     }
10691     break;
10692   case MVT::v64i8:
10693     if (Subtarget.hasVBMI())
10694       Opcode = X86ISD::VPERMV;
10695     break;
10696   case MVT::v32i16:
10697     if (Subtarget.hasBWI())
10698       Opcode = X86ISD::VPERMV;
10699     break;
10700   case MVT::v16f32:
10701   case MVT::v16i32:
10702   case MVT::v8f64:
10703   case MVT::v8i64:
10704     if (Subtarget.hasAVX512())
10705       Opcode = X86ISD::VPERMV;
10706     break;
10707   }
10708   if (!Opcode)
10709     return SDValue();
10710 
10711   assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
10712          (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
10713          "Illegal variable permute shuffle type");
10714 
10715   uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
10716   if (Scale > 1)
10717     IndicesVec = ScaleIndices(IndicesVec, Scale);
10718 
10719   EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
10720   IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
10721 
10722   SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
10723   SDValue Res = Opcode == X86ISD::VPERMV
10724                     ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
10725                     : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
10726   return DAG.getBitcast(VT, Res);
10727 }
10728 
10729 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
10730 // reasoned to be a permutation of a vector by indices in a non-constant vector.
10731 // (build_vector (extract_elt V, (extract_elt I, 0)),
10732 //               (extract_elt V, (extract_elt I, 1)),
10733 //                    ...
10734 // ->
10735 // (vpermv I, V)
10736 //
10737 // TODO: Handle undefs
10738 // TODO: Utilize pshufb and zero mask blending to support more efficient
10739 // construction of vectors with constant-0 elements.
10740 static SDValue
10741 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
10742                                    const X86Subtarget &Subtarget) {
10743   SDValue SrcVec, IndicesVec;
10744   // Check for a match of the permute source vector and permute index elements.
10745   // This is done by checking that the i-th build_vector operand is of the form:
10746   // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
10747   for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
10748     SDValue Op = V.getOperand(Idx);
10749     if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10750       return SDValue();
10751 
10752     // If this is the first extract encountered in V, set the source vector,
10753     // otherwise verify the extract is from the previously defined source
10754     // vector.
10755     if (!SrcVec)
10756       SrcVec = Op.getOperand(0);
10757     else if (SrcVec != Op.getOperand(0))
10758       return SDValue();
10759     SDValue ExtractedIndex = Op->getOperand(1);
10760     // Peek through extends.
10761     if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
10762         ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
10763       ExtractedIndex = ExtractedIndex.getOperand(0);
10764     if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10765       return SDValue();
10766 
10767     // If this is the first extract from the index vector candidate, set the
10768     // indices vector, otherwise verify the extract is from the previously
10769     // defined indices vector.
10770     if (!IndicesVec)
10771       IndicesVec = ExtractedIndex.getOperand(0);
10772     else if (IndicesVec != ExtractedIndex.getOperand(0))
10773       return SDValue();
10774 
10775     auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
10776     if (!PermIdx || PermIdx->getAPIntValue() != Idx)
10777       return SDValue();
10778   }
10779 
10780   SDLoc DL(V);
10781   MVT VT = V.getSimpleValueType();
10782   return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10783 }
10784 
10785 SDValue
10786 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
10787   SDLoc dl(Op);
10788 
10789   MVT VT = Op.getSimpleValueType();
10790   MVT EltVT = VT.getVectorElementType();
10791   unsigned NumElems = Op.getNumOperands();
10792 
10793   // Generate vectors for predicate vectors.
10794   if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
10795     return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
10796 
10797   if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
10798     return VectorConstant;
10799 
10800   unsigned EVTBits = EltVT.getSizeInBits();
10801   APInt UndefMask = APInt::getZero(NumElems);
10802   APInt ZeroMask = APInt::getZero(NumElems);
10803   APInt NonZeroMask = APInt::getZero(NumElems);
10804   bool IsAllConstants = true;
10805   SmallSet<SDValue, 8> Values;
10806   unsigned NumConstants = NumElems;
10807   for (unsigned i = 0; i < NumElems; ++i) {
10808     SDValue Elt = Op.getOperand(i);
10809     if (Elt.isUndef()) {
10810       UndefMask.setBit(i);
10811       continue;
10812     }
10813     Values.insert(Elt);
10814     if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
10815       IsAllConstants = false;
10816       NumConstants--;
10817     }
10818     if (X86::isZeroNode(Elt)) {
10819       ZeroMask.setBit(i);
10820     } else {
10821       NonZeroMask.setBit(i);
10822     }
10823   }
10824 
10825   // All undef vector. Return an UNDEF. All zero vectors were handled above.
10826   if (NonZeroMask == 0) {
10827     assert(UndefMask.isAllOnes() && "Fully undef mask expected");
10828     return DAG.getUNDEF(VT);
10829   }
10830 
10831   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
10832 
10833   // If the upper elts of a ymm/zmm are undef/zero then we might be better off
10834   // lowering to a smaller build vector and padding with undef/zero.
10835   if ((VT.is256BitVector() || VT.is512BitVector()) &&
10836       !isFoldableUseOfShuffle(BV)) {
10837     unsigned UpperElems = NumElems / 2;
10838     APInt UndefOrZeroMask = UndefMask | ZeroMask;
10839     unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
10840     if (NumUpperUndefsOrZeros >= UpperElems) {
10841       if (VT.is512BitVector() &&
10842           NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
10843         UpperElems = NumElems - (NumElems / 4);
10844       bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
10845       MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
10846       SDValue NewBV =
10847           DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
10848       return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
10849     }
10850   }
10851 
10852   if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
10853     return AddSub;
10854   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
10855     return HorizontalOp;
10856   if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
10857     return Broadcast;
10858   if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
10859     return BitOp;
10860 
10861   unsigned NumZero = ZeroMask.countPopulation();
10862   unsigned NumNonZero = NonZeroMask.countPopulation();
10863 
10864   // If we are inserting one variable into a vector of non-zero constants, try
10865   // to avoid loading each constant element as a scalar. Load the constants as a
10866   // vector and then insert the variable scalar element. If insertion is not
10867   // supported, fall back to a shuffle to get the scalar blended with the
10868   // constants. Insertion into a zero vector is handled as a special-case
10869   // somewhere below here.
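  // For example, (build_vector 1.0, 2.0, x, 4.0) becomes a constant-pool load
  // of <1.0, 2.0, undef, 4.0> followed by an insertion of x at index 2 (or a
  // shuffle if the index is not in the low 128 bits).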
10870   if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
10871       (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
10872        isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
10873     // Create an all-constant vector. The variable element in the old
10874     // build vector is replaced by undef in the constant vector. Save the
10875     // variable scalar element and its index for use in the insertelement.
10876     LLVMContext &Context = *DAG.getContext();
10877     Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
10878     SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
10879     SDValue VarElt;
10880     SDValue InsIndex;
10881     for (unsigned i = 0; i != NumElems; ++i) {
10882       SDValue Elt = Op.getOperand(i);
10883       if (auto *C = dyn_cast<ConstantSDNode>(Elt))
10884         ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
10885       else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
10886         ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
10887       else if (!Elt.isUndef()) {
10888         assert(!VarElt.getNode() && !InsIndex.getNode() &&
10889                "Expected one variable element in this vector");
10890         VarElt = Elt;
10891         InsIndex = DAG.getVectorIdxConstant(i, dl);
10892       }
10893     }
10894     Constant *CV = ConstantVector::get(ConstVecOps);
10895     SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
10896 
    // The constants we just created may not be legal (e.g., floating point).
    // We must lower the vector right here because we cannot guarantee that
    // we'll legalize it before loading it. This is also why we could not just
    // create a new build vector here. If the build vector contains illegal
    // constants, it could get split back up into a series of insert elements.
10902     // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
10903     SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
10904     MachineFunction &MF = DAG.getMachineFunction();
10905     MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
10906     SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
10907     unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
10908     unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
10909     if (InsertC < NumEltsInLow128Bits)
10910       return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
10911 
10912     // There's no good way to insert into the high elements of a >128-bit
10913     // vector, so use shuffles to avoid an extract/insert sequence.
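    // For example, inserting into element 5 of a v8i32 uses the mask
    // <0, 1, 2, 3, 4, 8, 6, 7>, where index 8 selects the variable scalar from
    // the low element of S2V.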
10914     assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
10915     assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
10916     SmallVector<int, 8> ShuffleMask;
10917     unsigned NumElts = VT.getVectorNumElements();
10918     for (unsigned i = 0; i != NumElts; ++i)
10919       ShuffleMask.push_back(i == InsertC ? NumElts : i);
10920     SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
10921     return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
10922   }
10923 
10924   // Special case for single non-zero, non-undef, element.
10925   if (NumNonZero == 1) {
10926     unsigned Idx = NonZeroMask.countTrailingZeros();
10927     SDValue Item = Op.getOperand(Idx);
10928 
10929     // If we have a constant or non-constant insertion into the low element of
10930     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
10931     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
10932     // depending on what the source datatype is.
10933     if (Idx == 0) {
10934       if (NumZero == 0)
10935         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10936 
10937       if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
10938           EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
10939           (EltVT == MVT::i16 && Subtarget.hasFP16())) {
10940         assert((VT.is128BitVector() || VT.is256BitVector() ||
10941                 VT.is512BitVector()) &&
10942                "Expected an SSE value type!");
10943         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10944         // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
10945         // zero vector.
10946         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10947       }
10948 
10949       // We can't directly insert an i8 or i16 into a vector, so zero extend
10950       // it to i32 first.
10951       if (EltVT == MVT::i16 || EltVT == MVT::i8) {
10952         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
10953         MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
10954         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
10955         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10956         return DAG.getBitcast(VT, Item);
10957       }
10958     }
10959 
10960     // Is it a vector logical left shift?
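    // For example, v2i64 <0, x> can be built by shifting (scalar_to_vector x)
    // left by half the vector width.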
10961     if (NumElems == 2 && Idx == 1 &&
10962         X86::isZeroNode(Op.getOperand(0)) &&
10963         !X86::isZeroNode(Op.getOperand(1))) {
10964       unsigned NumBits = VT.getSizeInBits();
10965       return getVShift(true, VT,
10966                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
10967                                    VT, Op.getOperand(1)),
10968                        NumBits/2, DAG, *this, dl);
10969     }
10970 
10971     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
10972       return SDValue();
10973 
10974     // Otherwise, if this is a vector with i32 or f32 elements, and the element
10975     // is a non-constant being inserted into an element other than the low one,
10976     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
10977     // movd/movss) to move this into the low element, then shuffle it into
10978     // place.
10979     if (EVTBits == 32) {
10980       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10981       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
10982     }
10983   }
10984 
10985   // Splat is obviously ok. Let legalizer expand it to a shuffle.
10986   if (Values.size() == 1) {
10987     if (EVTBits == 32) {
10988       // Instead of a shuffle like this:
10989       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
10990       // Check if it's possible to issue this instead.
      // shuffle (vload ptr), undef, <1, 1, 1, 1>
10992       unsigned Idx = NonZeroMask.countTrailingZeros();
10993       SDValue Item = Op.getOperand(Idx);
10994       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
10995         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
10996     }
10997     return SDValue();
10998   }
10999 
11000   // A vector full of immediates; various special cases are already
11001   // handled, so this is best done with a single constant-pool load.
11002   if (IsAllConstants)
11003     return SDValue();
11004 
11005   if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
    return V;
11007 
11008   // See if we can use a vector load to get all of the elements.
11009   {
11010     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
11011     if (SDValue LD =
11012             EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
11013       return LD;
11014   }
11015 
11016   // If this is a splat of pairs of 32-bit elements, we can use a narrower
11017   // build_vector and broadcast it.
11018   // TODO: We could probably generalize this more.
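  // For example, v8i32 <a, b, a, b, a, b, a, b> is built as a v4i32
  // <a, b, undef, undef> build_vector, bitcast to v2i64, broadcast to v4i64 and
  // bitcast back to v8i32.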
11019   if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
11020     SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
11021                        DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
11022     auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
11023       // Make sure all the even/odd operands match.
11024       for (unsigned i = 2; i != NumElems; ++i)
11025         if (Ops[i % 2] != Op.getOperand(i))
11026           return false;
11027       return true;
11028     };
11029     if (CanSplat(Op, NumElems, Ops)) {
11030       MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
11031       MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
11032       // Create a new build vector and cast to v2i64/v2f64.
11033       SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
11034                                      DAG.getBuildVector(NarrowVT, dl, Ops));
11035       // Broadcast from v2i64/v2f64 and cast to final VT.
11036       MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
11037       return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
11038                                             NewBV));
11039     }
11040   }
11041 
11042   // For AVX-length vectors, build the individual 128-bit pieces and use
11043   // shuffles to put them in place.
11044   if (VT.getSizeInBits() > 128) {
11045     MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
11046 
11047     // Build both the lower and upper subvector.
11048     SDValue Lower =
11049         DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
11050     SDValue Upper = DAG.getBuildVector(
11051         HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
11052 
11053     // Recreate the wider vector with the lower and upper part.
11054     return concatSubVectors(Lower, Upper, DAG, dl);
11055   }
11056 
11057   // Let legalizer expand 2-wide build_vectors.
11058   if (EVTBits == 64) {
11059     if (NumNonZero == 1) {
11060       // One half is zero or undef.
11061       unsigned Idx = NonZeroMask.countTrailingZeros();
11062       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
11063                                Op.getOperand(Idx));
11064       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
11065     }
11066     return SDValue();
11067   }
11068 
11069   // If element VT is < 32 bits, convert it to inserts into a zero vector.
11070   if (EVTBits == 8 && NumElems == 16)
11071     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
11072                                           DAG, Subtarget))
11073       return V;
11074 
11075   if (EltVT == MVT::i16 && NumElems == 8)
11076     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
11077                                           DAG, Subtarget))
11078       return V;
11079 
11080   // If the element VT is 32 bits and there are 4 elements, try INSERTPS.
11081   if (EVTBits == 32 && NumElems == 4)
11082     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
11083       return V;
11084 
11085   // If the element VT is 32 bits, turn it into a number of shuffles.
11086   if (NumElems == 4 && NumZero > 0) {
11087     SmallVector<SDValue, 8> Ops(NumElems);
11088     for (unsigned i = 0; i < 4; ++i) {
11089       bool isZero = !NonZeroMask[i];
11090       if (isZero)
11091         Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
11092       else
11093         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11094     }
11095 
11096     for (unsigned i = 0; i < 2; ++i) {
11097       switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
11098         default: llvm_unreachable("Unexpected NonZero count");
11099         case 0:
11100           Ops[i] = Ops[i*2];  // Must be a zero vector.
11101           break;
11102         case 1:
11103           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
11104           break;
11105         case 2:
11106           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11107           break;
11108         case 3:
11109           Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11110           break;
11111       }
11112     }
11113 
11114     bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
11115     bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
11116     int MaskVec[] = {
11117       Reverse1 ? 1 : 0,
11118       Reverse1 ? 0 : 1,
11119       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
11120       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
11121     };
11122     return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
11123   }
11124 
11125   assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
11126 
11127   // Check for a build vector from mostly shuffle plus few inserting.
11128   if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
11129     return Sh;
11130 
11131   // For SSE 4.1, use insertps to put the high elements into the low element.
11132   if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
11133     SDValue Result;
11134     if (!Op.getOperand(0).isUndef())
11135       Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
11136     else
11137       Result = DAG.getUNDEF(VT);
11138 
11139     for (unsigned i = 1; i < NumElems; ++i) {
11140       if (Op.getOperand(i).isUndef()) continue;
11141       Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
11142                            Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
11143     }
11144     return Result;
11145   }
11146 
11147   // Otherwise, expand into a number of unpckl*; start by extending each of
11148   // our (non-undef) elements to the full vector width, with the element in the
11149   // bottom slot of the vector (which generates no code for SSE).
11150   SmallVector<SDValue, 8> Ops(NumElems);
11151   for (unsigned i = 0; i < NumElems; ++i) {
11152     if (!Op.getOperand(i).isUndef())
11153       Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11154     else
11155       Ops[i] = DAG.getUNDEF(VT);
11156   }
11157 
11158   // Next, we iteratively mix elements, e.g. for v4f32:
11159   //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
11160   //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
11161   //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
11162   for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
11163     // Generate scaled UNPCKL shuffle mask.
11164     SmallVector<int, 16> Mask;
11165     for(unsigned i = 0; i != Scale; ++i)
11166       Mask.push_back(i);
11167     for (unsigned i = 0; i != Scale; ++i)
11168       Mask.push_back(NumElems+i);
11169     Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
11170 
11171     for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
11172       Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
11173   }
11174   return Ops[0];
11175 }
11176 
11177 // 256-bit AVX can use the vinsertf128 instruction
11178 // to create 256-bit vectors from two other 128-bit ones.
11179 // TODO: Detect subvector broadcast here instead of DAG combine?
11180 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
11181                                       const X86Subtarget &Subtarget) {
11182   SDLoc dl(Op);
11183   MVT ResVT = Op.getSimpleValueType();
11184 
11185   assert((ResVT.is256BitVector() ||
11186           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
11187 
11188   unsigned NumOperands = Op.getNumOperands();
11189   unsigned NumZero = 0;
11190   unsigned NumNonZero = 0;
11191   unsigned NonZeros = 0;
11192   for (unsigned i = 0; i != NumOperands; ++i) {
11193     SDValue SubVec = Op.getOperand(i);
11194     if (SubVec.isUndef())
11195       continue;
11196     if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11197       ++NumZero;
11198     else {
11199       assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11200       NonZeros |= 1 << i;
11201       ++NumNonZero;
11202     }
11203   }
11204 
11205   // If we have more than 2 non-zeros, build each half separately.
11206   if (NumNonZero > 2) {
11207     MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11208     ArrayRef<SDUse> Ops = Op->ops();
11209     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11210                              Ops.slice(0, NumOperands/2));
11211     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11212                              Ops.slice(NumOperands/2));
11213     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11214   }
11215 
11216   // Otherwise, build it up through insert_subvectors.
11217   SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
11218                         : DAG.getUNDEF(ResVT);
11219 
11220   MVT SubVT = Op.getOperand(0).getSimpleValueType();
11221   unsigned NumSubElems = SubVT.getVectorNumElements();
11222   for (unsigned i = 0; i != NumOperands; ++i) {
11223     if ((NonZeros & (1 << i)) == 0)
11224       continue;
11225 
11226     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
11227                       Op.getOperand(i),
11228                       DAG.getIntPtrConstant(i * NumSubElems, dl));
11229   }
11230 
11231   return Vec;
11232 }
11233 
11234 // Lower a CONCAT_VECTORS of vXi1 mask vectors by building the result in a
11235 // k-register with subvector inserts, a single KSHIFTL, or (when the result
11236 // type is legal) KUNPCK.
11237 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
11238 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11239                                        const X86Subtarget &Subtarget,
11240                                        SelectionDAG & DAG) {
11241   SDLoc dl(Op);
11242   MVT ResVT = Op.getSimpleValueType();
11243   unsigned NumOperands = Op.getNumOperands();
11244 
11245   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11246          "Unexpected number of operands in CONCAT_VECTORS");
11247 
11248   uint64_t Zeros = 0;
11249   uint64_t NonZeros = 0;
11250   for (unsigned i = 0; i != NumOperands; ++i) {
11251     SDValue SubVec = Op.getOperand(i);
11252     if (SubVec.isUndef())
11253       continue;
11254     assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11255     if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11256       Zeros |= (uint64_t)1 << i;
11257     else
11258       NonZeros |= (uint64_t)1 << i;
11259   }
11260 
11261   unsigned NumElems = ResVT.getVectorNumElements();
11262 
11263   // If we are inserting a non-zero vector and there are zeros in the LSBs and
11264   // undefs in the MSBs, we need to emit a KSHIFTL. The generic lowering to
11265   // insert_subvector would give us two kshifts.
11266   if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11267       Log2_64(NonZeros) != NumOperands - 1) {
11268     MVT ShiftVT = ResVT;
11269     if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11270       ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11271     unsigned Idx = Log2_64(NonZeros);
11272     SDValue SubVec = Op.getOperand(Idx);
11273     unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11274     SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11275                          DAG.getUNDEF(ShiftVT), SubVec,
11276                          DAG.getIntPtrConstant(0, dl));
11277     Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11278                      DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11279     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11280                        DAG.getIntPtrConstant(0, dl));
11281   }
11282 
11283   // If there are zero or one non-zeros we can handle this very simply.
11284   if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11285     SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11286     if (!NonZeros)
11287       return Vec;
11288     unsigned Idx = Log2_64(NonZeros);
11289     SDValue SubVec = Op.getOperand(Idx);
11290     unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11291     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11292                        DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11293   }
11294 
11295   if (NumOperands > 2) {
11296     MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11297     ArrayRef<SDUse> Ops = Op->ops();
11298     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11299                              Ops.slice(0, NumOperands/2));
11300     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11301                              Ops.slice(NumOperands/2));
11302     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11303   }
11304 
11305   assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
11306 
11307   if (ResVT.getVectorNumElements() >= 16)
11308     return Op; // The operation is legal with KUNPCK
11309 
11310   SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11311                             DAG.getUNDEF(ResVT), Op.getOperand(0),
11312                             DAG.getIntPtrConstant(0, dl));
11313   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11314                      DAG.getIntPtrConstant(NumElems/2, dl));
11315 }
11316 
11317 static SDValue LowerCONCAT_VECTORS(SDValue Op,
11318                                    const X86Subtarget &Subtarget,
11319                                    SelectionDAG &DAG) {
11320   MVT VT = Op.getSimpleValueType();
11321   if (VT.getVectorElementType() == MVT::i1)
11322     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11323 
11324   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11325          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11326           Op.getNumOperands() == 4)));
11327 
11328   // AVX can use the vinsertf128 instruction to create 256-bit vectors
11329   // from two other 128-bit ones.
11330 
11331   // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
11332   return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11333 }
11334 
11335 //===----------------------------------------------------------------------===//
11336 // Vector shuffle lowering
11337 //
11338 // This is an experimental code path for lowering vector shuffles on x86. It is
11339 // designed to handle arbitrary vector shuffles and blends, gracefully
11340 // degrading performance as necessary. It works hard to recognize idiomatic
11341 // shuffles and lower them to optimal instruction patterns without leaving
11342 // a framework that allows reasonably efficient handling of all vector shuffle
11343 // patterns.
11344 //===----------------------------------------------------------------------===//
11345 
11346 /// Tiny helper function to identify a no-op mask.
11347 ///
11348 /// This is a somewhat boring predicate function. It checks whether the mask
11349 /// array input, which is assumed to be a single-input shuffle mask of the kind
11350 /// used by the X86 shuffle instructions (not a fully general
11351 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
11352 /// in-place shuffle are 'no-op's.
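/// For example, for a 4-element mask, <0, -1, 2, 3> is a no-op, while
/// <1, 0, 2, 3> is not.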
11353 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11354   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11355     assert(Mask[i] >= -1 && "Out of bound mask element!");
11356     if (Mask[i] >= 0 && Mask[i] != i)
11357       return false;
11358   }
11359   return true;
11360 }
11361 
11362 /// Test whether there are elements crossing LaneSizeInBits lanes in this
11363 /// shuffle mask.
11364 ///
11365 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11366 /// and we routinely test for these.
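/// For example, with 128-bit lanes and 32-bit scalars (4 elements per lane),
/// the v8 mask <0, 1, 2, 3, 4, 5, 6, 7> does not cross lanes, whereas
/// <4, 1, 2, 3, 0, 5, 6, 7> does (elements 0 and 4 swap lanes).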
11367 static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11368                                       unsigned ScalarSizeInBits,
11369                                       ArrayRef<int> Mask) {
11370   assert(LaneSizeInBits && ScalarSizeInBits &&
11371          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11372          "Illegal shuffle lane size");
11373   int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11374   int Size = Mask.size();
11375   for (int i = 0; i < Size; ++i)
11376     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11377       return true;
11378   return false;
11379 }
11380 
11381 /// Test whether there are elements crossing 128-bit lanes in this
11382 /// shuffle mask.
11383 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11384   return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11385 }
11386 
11387 /// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11388 /// from multiple lanes - this is different from isLaneCrossingShuffleMask to
11389 /// better support 'repeated mask + lane permute' style shuffles.
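/// For example, with 128-bit lanes and 32-bit scalars, the v8 mask
/// <0, 4, -1, -1, 4, 5, 6, 7> is multi-lane: the first lane of the result
/// draws from both source lane 0 (element 0) and source lane 1 (element 4).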
11390 static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11391                                    unsigned ScalarSizeInBits,
11392                                    ArrayRef<int> Mask) {
11393   assert(LaneSizeInBits && ScalarSizeInBits &&
11394          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11395          "Illegal shuffle lane size");
11396   int NumElts = Mask.size();
11397   int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11398   int NumLanes = NumElts / NumEltsPerLane;
11399   if (NumLanes > 1) {
11400     for (int i = 0; i != NumLanes; ++i) {
11401       int SrcLane = -1;
11402       for (int j = 0; j != NumEltsPerLane; ++j) {
11403         int M = Mask[(i * NumEltsPerLane) + j];
11404         if (M < 0)
11405           continue;
11406         int Lane = (M % NumElts) / NumEltsPerLane;
11407         if (SrcLane >= 0 && SrcLane != Lane)
11408           return true;
11409         SrcLane = Lane;
11410       }
11411     }
11412   }
11413   return false;
11414 }
11415 
11416 /// Test whether a shuffle mask is equivalent within each sub-lane.
11417 ///
11418 /// This checks a shuffle mask to see if it is performing the same
11419 /// lane-relative shuffle in each sub-lane. This trivially implies
11420 /// that it is also not lane-crossing. It may however involve a blend from the
11421 /// same lane of a second vector.
11422 ///
11423 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11424 /// non-trivial to compute in the face of undef lanes. The representation is
11425 /// suitable for use with existing 128-bit shuffles as entries from the second
11426 /// vector have been remapped to [LaneSize, 2*LaneSize).
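/// For example, for a v8f32 shuffle with 128-bit lanes, the mask
/// <0, 9, 2, 11, 4, 13, 6, 15> is lane-repeated with RepeatedMask
/// <0, 5, 2, 7>, the second-vector entries having been remapped to [4, 8).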
11427 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11428                                   ArrayRef<int> Mask,
11429                                   SmallVectorImpl<int> &RepeatedMask) {
11430   auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11431   RepeatedMask.assign(LaneSize, -1);
11432   int Size = Mask.size();
11433   for (int i = 0; i < Size; ++i) {
11434     assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
11435     if (Mask[i] < 0)
11436       continue;
11437     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11438       // This entry crosses lanes, so there is no way to model this shuffle.
11439       return false;
11440 
11441     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
11442     // Adjust second vector indices to start at LaneSize instead of Size.
11443     int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
11444                                 : Mask[i] % LaneSize + LaneSize;
11445     if (RepeatedMask[i % LaneSize] < 0)
11446       // This is the first non-undef entry in this slot of the lane.
11447       RepeatedMask[i % LaneSize] = LocalM;
11448     else if (RepeatedMask[i % LaneSize] != LocalM)
11449       // Found a mismatch with the repeated mask.
11450       return false;
11451   }
11452   return true;
11453 }
11454 
11455 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
11456 static bool
11457 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11458                                 SmallVectorImpl<int> &RepeatedMask) {
11459   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11460 }
11461 
11462 static bool
11463 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11464   SmallVector<int, 32> RepeatedMask;
11465   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11466 }
11467 
11468 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
11469 static bool
11470 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11471                                 SmallVectorImpl<int> &RepeatedMask) {
11472   return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11473 }
11474 
11475 /// Test whether a target shuffle mask is equivalent within each sub-lane.
11476 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11477 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11478                                         unsigned EltSizeInBits,
11479                                         ArrayRef<int> Mask,
11480                                         SmallVectorImpl<int> &RepeatedMask) {
11481   int LaneSize = LaneSizeInBits / EltSizeInBits;
11482   RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11483   int Size = Mask.size();
11484   for (int i = 0; i < Size; ++i) {
11485     assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11486     if (Mask[i] == SM_SentinelUndef)
11487       continue;
11488     if (Mask[i] == SM_SentinelZero) {
11489       if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11490         return false;
11491       RepeatedMask[i % LaneSize] = SM_SentinelZero;
11492       continue;
11493     }
11494     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11495       // This entry crosses lanes, so there is no way to model this shuffle.
11496       return false;
11497 
11498     // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11499     // later vector indices to start at multiples of LaneSize instead of Size.
11500     int LaneM = Mask[i] / Size;
11501     int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11502     if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11503       // This is the first non-undef entry in this slot of the lane.
11504       RepeatedMask[i % LaneSize] = LocalM;
11505     else if (RepeatedMask[i % LaneSize] != LocalM)
11506       // Found a mismatch with the repeated mask.
11507       return false;
11508   }
11509   return true;
11510 }
11511 
11512 /// Test whether a target shuffle mask is equivalent within each sub-lane.
11513 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11514 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11515                                         ArrayRef<int> Mask,
11516                                         SmallVectorImpl<int> &RepeatedMask) {
11517   return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11518                                      Mask, RepeatedMask);
11519 }
11520 
11521 /// Checks whether the vector elements referenced by two shuffle masks are
11522 /// equivalent.
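/// For example, two uses of the same X86ISD::VBROADCAST node are equivalent at
/// any pair of indices, and two identical BUILD_VECTOR nodes are equivalent at
/// indices whose operands are the same value.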
11523 static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11524                                 int Idx, int ExpectedIdx) {
11525   assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11526          ExpectedIdx < MaskSize && "Out of range element index");
11527   if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11528     return false;
11529 
11530   switch (Op.getOpcode()) {
11531   case ISD::BUILD_VECTOR:
11532     // If the values are build vectors, we can look through them to find
11533     // equivalent inputs that make the shuffles equivalent.
11534     // TODO: Handle MaskSize != Op.getNumOperands()?
11535     if (MaskSize == (int)Op.getNumOperands() &&
11536         MaskSize == (int)ExpectedOp.getNumOperands())
11537       return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
11538     break;
11539   case X86ISD::VBROADCAST:
11540   case X86ISD::VBROADCAST_LOAD:
11541     // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
11542     return (Op == ExpectedOp &&
11543             (int)Op.getValueType().getVectorNumElements() == MaskSize);
11544   case X86ISD::HADD:
11545   case X86ISD::HSUB:
11546   case X86ISD::FHADD:
11547   case X86ISD::FHSUB:
11548   case X86ISD::PACKSS:
11549   case X86ISD::PACKUS:
11550     // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
11551     // TODO: Handle MaskSize != NumElts?
11552     // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
11553     if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
11554       MVT VT = Op.getSimpleValueType();
11555       int NumElts = VT.getVectorNumElements();
11556       if (MaskSize == NumElts) {
11557         int NumLanes = VT.getSizeInBits() / 128;
11558         int NumEltsPerLane = NumElts / NumLanes;
11559         int NumHalfEltsPerLane = NumEltsPerLane / 2;
11560         bool SameLane =
11561             (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
11562         bool SameElt =
11563             (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
11564         return SameLane && SameElt;
11565       }
11566     }
11567     break;
11568   }
11569 
11570   return false;
11571 }
11572 
11573 /// Checks whether a shuffle mask is equivalent to an explicit list of
11574 /// arguments.
11575 ///
11576 /// This is a fast way to test a shuffle mask against a fixed pattern:
11577 ///
11578 ///   if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
11579 ///
11580 /// It returns true if the mask is exactly as wide as ExpectedMask, and each
11581 /// element of the mask is either -1 (signifying undef) or matches the
11582 /// corresponding ExpectedMask element (possibly via an equivalent source).
11583 static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
11584                                 SDValue V1 = SDValue(),
11585                                 SDValue V2 = SDValue()) {
11586   int Size = Mask.size();
11587   if (Size != (int)ExpectedMask.size())
11588     return false;
11589 
11590   for (int i = 0; i < Size; ++i) {
11591     assert(Mask[i] >= -1 && "Out of bound mask element!");
11592     int MaskIdx = Mask[i];
11593     int ExpectedIdx = ExpectedMask[i];
11594     if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
11595       SDValue MaskV = MaskIdx < Size ? V1 : V2;
11596       SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11597       MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11598       ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11599       if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11600         return false;
11601     }
11602   }
11603   return true;
11604 }
11605 
11606 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
11607 ///
11608 /// The masks must be exactly the same width.
11609 ///
11610 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
11611 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
11612 ///
11613 /// SM_SentinelZero is accepted as a valid negative index but must match in
11614 /// both.
11615 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
11616                                       ArrayRef<int> ExpectedMask,
11617                                       SDValue V1 = SDValue(),
11618                                       SDValue V2 = SDValue()) {
11619   int Size = Mask.size();
11620   if (Size != (int)ExpectedMask.size())
11621     return false;
11622   assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
11623          "Illegal target shuffle mask");
11624 
11625   // Check for out-of-range target shuffle mask indices.
11626   if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
11627     return false;
11628 
11629   // Don't use V1/V2 if they're not the same size as the shuffle mask type.
11630   if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
11631     V1 = SDValue();
11632   if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
11633     V2 = SDValue();
11634 
11635   for (int i = 0; i < Size; ++i) {
11636     int MaskIdx = Mask[i];
11637     int ExpectedIdx = ExpectedMask[i];
11638     if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
11639       continue;
11640     if (0 <= MaskIdx && 0 <= ExpectedIdx) {
11641       SDValue MaskV = MaskIdx < Size ? V1 : V2;
11642       SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11643       MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11644       ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11645       if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11646         continue;
11647     }
11648     // TODO - handle SM_Sentinel equivalences.
11649     return false;
11650   }
11651   return true;
11652 }
11653 
11654 // Attempt to create a shuffle mask from a VSELECT condition mask.
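// For example, a v4i32 VSELECT whose condition constant is <-1, 0, -1, 0>
// yields the shuffle mask <0, 5, 2, 7>, with true lanes selecting from the
// first value operand and false/undef lanes from the second.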
11655 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
11656                                          SDValue Cond) {
11657   EVT CondVT = Cond.getValueType();
11658   unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
11659   unsigned NumElts = CondVT.getVectorNumElements();
11660 
11661   APInt UndefElts;
11662   SmallVector<APInt, 32> EltBits;
11663   if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
11664                                      true, false))
11665     return false;
11666 
11667   Mask.resize(NumElts, SM_SentinelUndef);
11668 
11669   for (int i = 0; i != (int)NumElts; ++i) {
11670     Mask[i] = i;
11671     // Arbitrarily choose from the 2nd operand if the select condition element
11672     // is undef.
11673     // TODO: Can we do better by matching patterns such as even/odd?
11674     if (UndefElts[i] || EltBits[i].isZero())
11675       Mask[i] += NumElts;
11676   }
11677 
11678   return true;
11679 }
11680 
11681 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
11682 // instructions.
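// For v8i32/v8f32 these correspond to the binary v8i16 unpack masks
// <0, 8, 1, 9, 2, 10, 3, 11> (lo) and <4, 12, 5, 13, 6, 14, 7, 15> (hi).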
11683 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
11684   if (VT != MVT::v8i32 && VT != MVT::v8f32)
11685     return false;
11686 
11687   SmallVector<int, 8> Unpcklwd;
11688   createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
11689                           /* Unary = */ false);
11690   SmallVector<int, 8> Unpckhwd;
11691   createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
11692                           /* Unary = */ false);
11693   bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
11694                          isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
11695   return IsUnpackwdMask;
11696 }
11697 
11698 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
11699   // Create 128-bit vector type based on mask size.
11700   MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
11701   MVT VT = MVT::getVectorVT(EltVT, Mask.size());
11702 
11703   // We can't assume a canonical shuffle mask, so try the commuted version too.
11704   SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11705   ShuffleVectorSDNode::commuteMask(CommutedMask);
11706 
11707   // Match any of unary/binary or low/high.
11708   for (unsigned i = 0; i != 4; ++i) {
11709     SmallVector<int, 16> UnpackMask;
11710     createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
11711     if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
11712         isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
11713       return true;
11714   }
11715   return false;
11716 }
11717 
11718 /// Return true if a shuffle mask chooses elements identically in its top and
11719 /// bottom halves. For example, any splat mask has the same top and bottom
11720 /// halves. If an element is undefined in only one half of the mask, the halves
11721 /// are not considered identical.
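/// For example, <0, 1, 0, 1> has identical halves, while <0, 1, 0, -1> does
/// not (element 1 is undefined in only one half).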
11722 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
11723   assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
11724   unsigned HalfSize = Mask.size() / 2;
11725   for (unsigned i = 0; i != HalfSize; ++i) {
11726     if (Mask[i] != Mask[i + HalfSize])
11727       return false;
11728   }
11729   return true;
11730 }
11731 
11732 /// Get a 4-lane 8-bit shuffle immediate for a mask.
11733 ///
11734 /// This helper function produces an 8-bit shuffle immediate corresponding to
11735 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
11736 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
11737 /// example.
11738 ///
11739 /// NB: We rely heavily on "undef" masks preserving the input lane.
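/// For example, the mask <3, 2, 1, 0> yields the immediate
/// (0 << 6) | (1 << 4) | (2 << 2) | 3 = 0x1B, and a single-element mask such
/// as <1, -1, -1, -1> is splatted to <1, 1, 1, 1>, i.e. 0x55.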
11740 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
11741   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
11742   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
11743   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
11744   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
11745   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
11746 
11747   // If the mask only uses one non-undef element, then fully 'splat' it to
11748   // improve later broadcast matching.
11749   int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
11750   assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
11751 
11752   int FirstElt = Mask[FirstIndex];
11753   if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
11754     return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
11755 
11756   unsigned Imm = 0;
11757   Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
11758   Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
11759   Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
11760   Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
11761   return Imm;
11762 }
11763 
11764 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
11765                                           SelectionDAG &DAG) {
11766   return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
11767 }
11768 
11769 // The shuffle result has the form:
11770 //   0* a[0] 0* a[1] ... 0* a[n]   (n >= 0)
11771 // i.e. runs of zeros interleaved with the a[] elements in ascending order.
11772 // Each element of Zeroable corresponds to a particular Mask element, as
11773 // described in the computeZeroableShuffleElements function.
11774 // The function looks for a sub-mask whose non-zero elements are in
11775 // increasing order; if such a sub-mask exists, it returns true.
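// For example, for a v4i32 with elements 1 and 3 zeroable, the mask
// <0, 6, 1, 7> is accepted (IsZeroSideLeft == false) because its non-zeroable
// elements <0, 1> are consecutive and start at 0.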
11776 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
11777                                      ArrayRef<int> Mask, const EVT &VectorType,
11778                                      bool &IsZeroSideLeft) {
11779   int NextElement = -1;
11780   // Check if the Mask's nonzero elements are in increasing order.
11781   for (int i = 0, e = Mask.size(); i < e; i++) {
11782     // Bail on undef mask elements; zeros must come from real zero inputs.
11783     assert(Mask[i] >= -1 && "Out of bound mask element!");
11784     if (Mask[i] < 0)
11785       return false;
11786     if (Zeroable[i])
11787       continue;
11788     // Find the lowest non-zero element.
11789     if (NextElement < 0) {
11790       NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
11791       IsZeroSideLeft = NextElement != 0;
11792     }
11793     // Exit if the mask's non-zero elements are not in increasing order.
11794     if (NextElement != Mask[i])
11795       return false;
11796     NextElement++;
11797   }
11798   return true;
11799 }
11800 
11801 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
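/// For example, a v8i16 splat of element 0 of V1 becomes a v16i8 PSHUFB with
/// the byte mask <0, 1, 0, 1, ...>; zeroable elements use the 0x80 'zero' byte.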
11802 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
11803                                       ArrayRef<int> Mask, SDValue V1,
11804                                       SDValue V2, const APInt &Zeroable,
11805                                       const X86Subtarget &Subtarget,
11806                                       SelectionDAG &DAG) {
11807   int Size = Mask.size();
11808   int LaneSize = 128 / VT.getScalarSizeInBits();
11809   const int NumBytes = VT.getSizeInBits() / 8;
11810   const int NumEltBytes = VT.getScalarSizeInBits() / 8;
11811 
11812   assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
11813          (Subtarget.hasAVX2() && VT.is256BitVector()) ||
11814          (Subtarget.hasBWI() && VT.is512BitVector()));
11815 
11816   SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
11817   // Sign bit set in i8 mask means zero element.
11818   SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
11819 
11820   SDValue V;
11821   for (int i = 0; i < NumBytes; ++i) {
11822     int M = Mask[i / NumEltBytes];
11823     if (M < 0) {
11824       PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
11825       continue;
11826     }
11827     if (Zeroable[i / NumEltBytes]) {
11828       PSHUFBMask[i] = ZeroMask;
11829       continue;
11830     }
11831 
11832     // We can only use a single input of V1 or V2.
11833     SDValue SrcV = (M >= Size ? V2 : V1);
11834     if (V && V != SrcV)
11835       return SDValue();
11836     V = SrcV;
11837     M %= Size;
11838 
11839     // PSHUFB can't cross lanes; ensure this doesn't happen.
11840     if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
11841       return SDValue();
11842 
11843     M = M % LaneSize;
11844     M = M * NumEltBytes + (i % NumEltBytes);
11845     PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
11846   }
11847   assert(V && "Failed to find a source input");
11848 
11849   MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
11850   return DAG.getBitcast(
11851       VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
11852                       DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
11853 }
11854 
11855 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
11856                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
11857                            const SDLoc &dl);
11858 
11859 // Certain zero-insertion shuffles can be lowered to X86's dedicated VEXPAND.
11860 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
11861                                     const APInt &Zeroable,
11862                                     ArrayRef<int> Mask, SDValue &V1,
11863                                     SDValue &V2, SelectionDAG &DAG,
11864                                     const X86Subtarget &Subtarget) {
11865   bool IsLeftZeroSide = true;
11866   if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
11867                                 IsLeftZeroSide))
11868     return SDValue();
11869   unsigned VEXPANDMask = (~Zeroable).getZExtValue();
11870   MVT IntegerType =
11871       MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11872   SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
11873   unsigned NumElts = VT.getVectorNumElements();
11874   assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
11875          "Unexpected number of vector elements");
11876   SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
11877                               Subtarget, DAG, DL);
11878   SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
11879   SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
11880   return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
11881 }
11882 
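// Attempt to match a target shuffle mask against the UNPCKL/UNPCKH patterns,
// returning the matched opcode in UnpackOpcode and updating V1/V2 as needed
// (possibly commuted, replaced with UNDEF, or paired with a zero vector).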
11883 static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
11884                                   unsigned &UnpackOpcode, bool IsUnary,
11885                                   ArrayRef<int> TargetMask, const SDLoc &DL,
11886                                   SelectionDAG &DAG,
11887                                   const X86Subtarget &Subtarget) {
11888   int NumElts = VT.getVectorNumElements();
11889 
11890   bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
11891   for (int i = 0; i != NumElts; i += 2) {
11892     int M1 = TargetMask[i + 0];
11893     int M2 = TargetMask[i + 1];
11894     Undef1 &= (SM_SentinelUndef == M1);
11895     Undef2 &= (SM_SentinelUndef == M2);
11896     Zero1 &= isUndefOrZero(M1);
11897     Zero2 &= isUndefOrZero(M2);
11898   }
11899   assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
11900          "Zeroable shuffle detected");
11901 
11902   // Attempt to match the target mask against the unpack lo/hi mask patterns.
11903   SmallVector<int, 64> Unpckl, Unpckh;
11904   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
11905   if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
11906                                 (IsUnary ? V1 : V2))) {
11907     UnpackOpcode = X86ISD::UNPCKL;
11908     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11909     V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11910     return true;
11911   }
11912 
11913   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
11914   if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
11915                                 (IsUnary ? V1 : V2))) {
11916     UnpackOpcode = X86ISD::UNPCKH;
11917     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11918     V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11919     return true;
11920   }
11921 
11922   // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
11923   if (IsUnary && (Zero1 || Zero2)) {
11924     // Don't bother if we can blend instead.
11925     if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11926         isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
11927       return false;
11928 
11929     bool MatchLo = true, MatchHi = true;
11930     for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11931       int M = TargetMask[i];
11932 
11933       // Ignore if the input is known to be zero or the index is undef.
11934       if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11935           (M == SM_SentinelUndef))
11936         continue;
11937 
11938       MatchLo &= (M == Unpckl[i]);
11939       MatchHi &= (M == Unpckh[i]);
11940     }
11941 
11942     if (MatchLo || MatchHi) {
11943       UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11944       V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11945       V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11946       return true;
11947     }
11948   }
11949 
11950   // If a binary shuffle, commute and try again.
11951   if (!IsUnary) {
11952     ShuffleVectorSDNode::commuteMask(Unpckl);
11953     if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
11954       UnpackOpcode = X86ISD::UNPCKL;
11955       std::swap(V1, V2);
11956       return true;
11957     }
11958 
11959     ShuffleVectorSDNode::commuteMask(Unpckh);
11960     if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
11961       UnpackOpcode = X86ISD::UNPCKH;
11962       std::swap(V1, V2);
11963       return true;
11964     }
11965   }
11966 
11967   return false;
11968 }
11969 
11970 // X86 has dedicated unpack instructions that can handle specific blend
11971 // operations: UNPCKH and UNPCKL.
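// For v4i32, the binary unpack masks are <0, 4, 1, 5> (UNPCKL) and
// <2, 6, 3, 7> (UNPCKH); commuted masks are also matched by swapping V1/V2.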
11972 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
11973                                      ArrayRef<int> Mask, SDValue V1, SDValue V2,
11974                                      SelectionDAG &DAG) {
11975   SmallVector<int, 8> Unpckl;
11976   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
11977   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11978     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
11979 
11980   SmallVector<int, 8> Unpckh;
11981   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
11982   if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11983     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
11984 
11985   // Commute and try again.
11986   ShuffleVectorSDNode::commuteMask(Unpckl);
11987   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11988     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
11989 
11990   ShuffleVectorSDNode::commuteMask(Unpckh);
11991   if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11992     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
11993 
11994   return SDValue();
11995 }
11996 
11997 /// Check whether the mask can be mapped to a preliminary 64-bit element
11998 /// permute (vperm) followed by a 256-bit unpack.
11999 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
12000                                         ArrayRef<int> Mask, SDValue V1,
12001                                         SDValue V2, SelectionDAG &DAG) {
12002   SmallVector<int, 32> Unpckl, Unpckh;
12003   createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
12004   createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
12005 
12006   unsigned UnpackOpcode;
12007   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12008     UnpackOpcode = X86ISD::UNPCKL;
12009   else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12010     UnpackOpcode = X86ISD::UNPCKH;
12011   else
12012     return SDValue();
12013 
12014   // This is a "natural" unpack operation (rather than the 128-bit sectored
12015   // operation implemented by AVX). We need to rearrange 64-bit chunks of the
12016   // input in order to use the x86 instruction.
12017   V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
12018                             DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
12019   V1 = DAG.getBitcast(VT, V1);
12020   return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
12021 }
12022 
12023 // Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
12024 // source into the lower elements and zeroing the upper elements.
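// For example, for a v8i16 shuffle, Scale == 2 matches the mask
// <0, 2, 4, 6, z, z, z, z> (upper elements zeroable), giving SrcVT == v4i32
// and DstVT == v8i16 for an X86ISD::VTRUNC.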
12025 static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
12026                                  ArrayRef<int> Mask, const APInt &Zeroable,
12027                                  const X86Subtarget &Subtarget) {
12028   if (!VT.is512BitVector() && !Subtarget.hasVLX())
12029     return false;
12030 
12031   unsigned NumElts = Mask.size();
12032   unsigned EltSizeInBits = VT.getScalarSizeInBits();
12033   unsigned MaxScale = 64 / EltSizeInBits;
12034 
12035   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12036     unsigned SrcEltBits = EltSizeInBits * Scale;
12037     if (SrcEltBits < 32 && !Subtarget.hasBWI())
12038       continue;
12039     unsigned NumSrcElts = NumElts / Scale;
12040     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
12041       continue;
12042     unsigned UpperElts = NumElts - NumSrcElts;
12043     if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12044       continue;
12045     SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
12046     SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
12047     DstVT = MVT::getIntegerVT(EltSizeInBits);
12048     if ((NumSrcElts * EltSizeInBits) >= 128) {
12049       // ISD::TRUNCATE
12050       DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
12051     } else {
12052       // X86ISD::VTRUNC
12053       DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
12054     }
12055     return true;
12056   }
12057 
12058   return false;
12059 }
12060 
12061 // Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
12062 // element padding to the final DstVT.
12063 static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
12064                                   const X86Subtarget &Subtarget,
12065                                   SelectionDAG &DAG, bool ZeroUppers) {
12066   MVT SrcVT = Src.getSimpleValueType();
12067   MVT DstSVT = DstVT.getScalarType();
12068   unsigned NumDstElts = DstVT.getVectorNumElements();
12069   unsigned NumSrcElts = SrcVT.getVectorNumElements();
12070   unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
12071 
12072   if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
12073     return SDValue();
12074 
12075   // Perform a direct ISD::TRUNCATE if possible.
12076   if (NumSrcElts == NumDstElts)
12077     return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
12078 
12079   if (NumSrcElts > NumDstElts) {
12080     MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12081     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12082     return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
12083   }
12084 
12085   if ((NumSrcElts * DstEltSizeInBits) >= 128) {
12086     MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12087     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12088     return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12089                           DstVT.getSizeInBits());
12090   }
12091 
12092   // Non-VLX targets must truncate from a 512-bit type, so we need to
12093   // widen, truncate and then possibly extract the original subvector.
12094   if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
12095     SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
12096     return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
12097   }
12098 
12099   // Fallback to a X86ISD::VTRUNC, padding if necessary.
12100   MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
12101   SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
12102   if (DstVT != TruncVT)
12103     Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12104                            DstVT.getSizeInBits());
12105   return Trunc;
12106 }
12107 
12108 // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
12109 //
12110 // An example is the following:
12111 //
12112 // t0: ch = EntryToken
12113 //           t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
12114 //         t25: v4i32 = truncate t2
12115 //       t41: v8i16 = bitcast t25
12116 //       t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
12117 //       Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
12118 //     t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
12119 //   t18: v2i64 = bitcast t51
12120 //
12121 // One can just use a single vpmovdw instruction; without AVX512VL we need to
12122 // use the zmm variant and extract the lower subvector, padding with zeroes.
12123 // TODO: Merge with lowerShuffleAsVTRUNC.
12124 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
12125                                      SDValue V2, ArrayRef<int> Mask,
12126                                      const APInt &Zeroable,
12127                                      const X86Subtarget &Subtarget,
12128                                      SelectionDAG &DAG) {
12129   assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
12130   if (!Subtarget.hasAVX512())
12131     return SDValue();
12132 
12133   unsigned NumElts = VT.getVectorNumElements();
12134   unsigned EltSizeInBits = VT.getScalarSizeInBits();
12135   unsigned MaxScale = 64 / EltSizeInBits;
12136   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12137     unsigned NumSrcElts = NumElts / Scale;
12138     unsigned UpperElts = NumElts - NumSrcElts;
12139     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12140         !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12141       continue;
12142 
12143     SDValue Src = V1;
12144     if (!Src.hasOneUse())
12145       return SDValue();
12146 
12147     Src = peekThroughOneUseBitcasts(Src);
12148     if (Src.getOpcode() != ISD::TRUNCATE ||
12149         Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
12150       return SDValue();
12151     Src = Src.getOperand(0);
12152 
12153     // VPMOVWB is only available with avx512bw.
12154     MVT SrcVT = Src.getSimpleValueType();
12155     if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
12156         !Subtarget.hasBWI())
12157       return SDValue();
12158 
12159     bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
12160     return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12161   }
12162 
12163   return SDValue();
12164 }
12165 
12166 // Attempt to match binary shuffle patterns as a truncate.
12167 static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
12168                                     SDValue V2, ArrayRef<int> Mask,
12169                                     const APInt &Zeroable,
12170                                     const X86Subtarget &Subtarget,
12171                                     SelectionDAG &DAG) {
12172   assert((VT.is128BitVector() || VT.is256BitVector()) &&
12173          "Unexpected VTRUNC type");
12174   if (!Subtarget.hasAVX512())
12175     return SDValue();
12176 
12177   unsigned NumElts = VT.getVectorNumElements();
12178   unsigned EltSizeInBits = VT.getScalarSizeInBits();
12179   unsigned MaxScale = 64 / EltSizeInBits;
12180   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12181     // TODO: Support non-BWI VPMOVWB truncations?
12182     unsigned SrcEltBits = EltSizeInBits * Scale;
12183     if (SrcEltBits < 32 && !Subtarget.hasBWI())
12184       continue;
12185 
12186     // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
12187     // Bail if the V2 elements are undef.
12188     unsigned NumHalfSrcElts = NumElts / Scale;
12189     unsigned NumSrcElts = 2 * NumHalfSrcElts;
12190     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12191         isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
12192       continue;
12193 
12194     // The elements beyond the truncation must be undef/zero.
12195     unsigned UpperElts = NumElts - NumSrcElts;
12196     if (UpperElts > 0 &&
12197         !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12198       continue;
12199     bool UndefUppers =
12200         UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
12201 
12202     // As we're using both sources, we need to concatenate them together
12203     // and truncate from the double-sized source.
12204     MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
12205     SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
12206 
12207     MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12208     MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12209     Src = DAG.getBitcast(SrcVT, Src);
12210     return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12211   }
12212 
12213   return SDValue();
12214 }
12215 
12216 /// Check whether a compaction lowering can be done by dropping even/odd
12217 /// elements and compute how many times even/odd elements must be dropped.
12218 ///
12219 /// This handles shuffles which take every Nth element where N is a power of
12220 /// two. Example shuffle masks:
12221 ///
12222 /// (even)
12223 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
12224 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12225 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
12226 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
12227 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
12228 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
12229 ///
12230 /// (odd)
12231 ///  N = 1:  1,  3,  5,  7,  9, 11, 13, 15,  0,  2,  4,  6,  8, 10, 12, 14
12232 ///  N = 1:  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
12233 ///
12234 /// Any of these lanes can of course be undef.
12235 ///
12236 /// This routine only supports N <= 3.
12237 /// FIXME: Evaluate whether either AVX or AVX-512 has any opportunities here
12238 /// for larger N.
12239 ///
12240 /// \returns N above, or the number of times even/odd elements must be dropped
12241 /// if there is such a number. Otherwise returns zero.
12242 static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
12243                                       bool IsSingleInput) {
12244   // The modulus for the shuffle vector entries is based on whether this is
12245   // a single input or not.
12246   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12247   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12248          "We should only be called with masks with a power-of-2 size!");
12249 
12250   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12251   int Offset = MatchEven ? 0 : 1;
12252 
12253   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12254   // and 2^3 simultaneously. This is because we may have ambiguity with
12255   // partially undef inputs.
12256   bool ViableForN[3] = {true, true, true};
12257 
12258   for (int i = 0, e = Mask.size(); i < e; ++i) {
12259     // Ignore undef lanes; we'll optimistically collapse them to the pattern we
12260     // want.
12261     if (Mask[i] < 0)
12262       continue;
12263 
12264     bool IsAnyViable = false;
12265     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12266       if (ViableForN[j]) {
12267         uint64_t N = j + 1;
12268 
12269         // The shuffle mask must be equal to (i * 2^N) % M.
12270         if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
12271           IsAnyViable = true;
12272         else
12273           ViableForN[j] = false;
12274       }
12275     // Early exit if we exhaust the possible powers of two.
12276     if (!IsAnyViable)
12277       break;
12278   }
12279 
12280   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12281     if (ViableForN[j])
12282       return j + 1;
12283 
12284   // Return 0 as there is no viable power of two.
12285   return 0;
12286 }
12287 
12288 // X86 has dedicated pack instructions that can handle specific truncation
12289 // operations: PACKSS and PACKUS.
12290 // Checks for compaction shuffle masks if MaxStages > 1.
12291 // TODO: Add support for matching multiple PACKSS/PACKUS stages.
12292 static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12293                                  unsigned &PackOpcode, ArrayRef<int> TargetMask,
12294                                  const SelectionDAG &DAG,
12295                                  const X86Subtarget &Subtarget,
12296                                  unsigned MaxStages = 1) {
12297   unsigned NumElts = VT.getVectorNumElements();
12298   unsigned BitSize = VT.getScalarSizeInBits();
12299   assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12300          "Illegal maximum compaction");
12301 
12302   auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12303     unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12304     unsigned NumPackedBits = NumSrcBits - BitSize;
12305     N1 = peekThroughBitcasts(N1);
12306     N2 = peekThroughBitcasts(N2);
12307     unsigned NumBits1 = N1.getScalarValueSizeInBits();
12308     unsigned NumBits2 = N2.getScalarValueSizeInBits();
12309     bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12310     bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12311     if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12312         (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12313       return false;
12314     if (Subtarget.hasSSE41() || BitSize == 8) {
12315       APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12316       if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12317           (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12318         V1 = N1;
12319         V2 = N2;
12320         SrcVT = PackVT;
12321         PackOpcode = X86ISD::PACKUS;
12322         return true;
12323       }
12324     }
12325     bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12326     bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12327     if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12328          DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12329         (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12330          DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12331       V1 = N1;
12332       V2 = N2;
12333       SrcVT = PackVT;
12334       PackOpcode = X86ISD::PACKSS;
12335       return true;
12336     }
12337     return false;
12338   };
12339 
12340   // Attempt to match against wider and wider compaction patterns.
12341   for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12342     MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12343     MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12344 
12345     // Try binary shuffle.
12346     SmallVector<int, 32> BinaryMask;
12347     createPackShuffleMask(VT, BinaryMask, false, NumStages);
12348     if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
12349       if (MatchPACK(V1, V2, PackVT))
12350         return true;
12351 
12352     // Try unary shuffle.
12353     SmallVector<int, 32> UnaryMask;
12354     createPackShuffleMask(VT, UnaryMask, true, NumStages);
12355     if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
12356       if (MatchPACK(V1, V1, PackVT))
12357         return true;
12358   }
12359 
12360   return false;
12361 }
12362 
12363 static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12364                                     SDValue V1, SDValue V2, SelectionDAG &DAG,
12365                                     const X86Subtarget &Subtarget) {
12366   MVT PackVT;
12367   unsigned PackOpcode;
12368   unsigned SizeBits = VT.getSizeInBits();
12369   unsigned EltBits = VT.getScalarSizeInBits();
12370   unsigned MaxStages = Log2_32(64 / EltBits);
12371   if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12372                             Subtarget, MaxStages))
12373     return SDValue();
12374 
12375   unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12376   unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12377 
12378   // Don't lower multi-stage packs on AVX512, truncation is better.
12379   if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12380     return SDValue();
12381 
12382   // Pack to the largest type possible:
12383   // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12384   unsigned MaxPackBits = 16;
12385   if (CurrentEltBits > 16 &&
12386       (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12387     MaxPackBits = 32;
12388 
12389   // Repeatedly pack down to the target size.
12390   SDValue Res;
12391   for (unsigned i = 0; i != NumStages; ++i) {
12392     unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12393     unsigned NumSrcElts = SizeBits / SrcEltBits;
12394     MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12395     MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12396     MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12397     MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12398     Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12399                       DAG.getBitcast(SrcVT, V2));
12400     V1 = V2 = Res;
12401     CurrentEltBits /= 2;
12402   }
12403   assert(Res && Res.getValueType() == VT &&
12404          "Failed to lower compaction shuffle");
12405   return Res;
12406 }
12407 
12408 /// Try to emit a bitmask instruction for a shuffle.
12409 ///
12410 /// This handles cases where we can model a blend exactly as a bitmask due to
12411 /// one of the inputs being zeroable.
12412 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
12413                                      SDValue V2, ArrayRef<int> Mask,
12414                                      const APInt &Zeroable,
12415                                      const X86Subtarget &Subtarget,
12416                                      SelectionDAG &DAG) {
12417   MVT MaskVT = VT;
12418   MVT EltVT = VT.getVectorElementType();
12419   SDValue Zero, AllOnes;
12420   // Use f64 if i64 isn't legal.
12421   if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
12422     EltVT = MVT::f64;
12423     MaskVT = MVT::getVectorVT(EltVT, Mask.size());
12424   }
12425 
12426   MVT LogicVT = VT;
12427   if (EltVT == MVT::f32 || EltVT == MVT::f64) {
12428     Zero = DAG.getConstantFP(0.0, DL, EltVT);
12429     APFloat AllOnesValue =
12430         APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
12431     AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
12432     LogicVT =
12433         MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
12434   } else {
12435     Zero = DAG.getConstant(0, DL, EltVT);
12436     AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12437   }
12438 
12439   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
12440   SDValue V;
12441   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12442     if (Zeroable[i])
12443       continue;
12444     if (Mask[i] % Size != i)
12445       return SDValue(); // Not a blend.
12446     if (!V)
12447       V = Mask[i] < Size ? V1 : V2;
12448     else if (V != (Mask[i] < Size ? V1 : V2))
12449       return SDValue(); // Can only let one input through the mask.
12450 
12451     VMaskOps[i] = AllOnes;
12452   }
12453   if (!V)
12454     return SDValue(); // No non-zeroable elements!
12455 
12456   SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
12457   VMask = DAG.getBitcast(LogicVT, VMask);
12458   V = DAG.getBitcast(LogicVT, V);
12459   SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
12460   return DAG.getBitcast(VT, And);
12461 }
12462 
12463 /// Try to emit a blend instruction for a shuffle using bit math.
12464 ///
12465 /// This is used as a fallback approach when first class blend instructions are
12466 /// unavailable. Currently it is only suitable for integer vectors, but could
12467 /// be generalized for floating point vectors if desirable.
12468 static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12469                                       SDValue V2, ArrayRef<int> Mask,
12470                                       SelectionDAG &DAG) {
12471   assert(VT.isInteger() && "Only supports integer vector types!");
12472   MVT EltVT = VT.getVectorElementType();
12473   SDValue Zero = DAG.getConstant(0, DL, EltVT);
12474   SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12475   SmallVector<SDValue, 16> MaskOps;
12476   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12477     if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12478       return SDValue(); // Shuffled input!
12479     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12480   }
12481 
12482   SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12483   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12484   V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12485   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12486 }
12487 
12488 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12489                                     SDValue PreservedSrc,
12490                                     const X86Subtarget &Subtarget,
12491                                     SelectionDAG &DAG);
12492 
12493 static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
12494                                 MutableArrayRef<int> Mask,
12495                                 const APInt &Zeroable, bool &ForceV1Zero,
12496                                 bool &ForceV2Zero, uint64_t &BlendMask) {
12497   bool V1IsZeroOrUndef =
12498       V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12499   bool V2IsZeroOrUndef =
12500       V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12501 
12502   BlendMask = 0;
12503   ForceV1Zero = false, ForceV2Zero = false;
12504   assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12505 
12506   // Attempt to generate the binary blend mask. If an input is zero then
12507   // we can use any lane.
12508   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12509     int M = Mask[i];
12510     if (M == SM_SentinelUndef)
12511       continue;
12512     if (M == i ||
12513         (0 <= M && M < Size && IsElementEquivalent(Size, V1, V1, M, i))) {
12514       Mask[i] = i;
12515       continue;
12516     }
12517     if (M == (i + Size) ||
12518         (Size <= M && IsElementEquivalent(Size, V2, V2, M - Size, i))) {
12519       BlendMask |= 1ull << i;
12520       Mask[i] = i + Size;
12521       continue;
12522     }
12523     if (Zeroable[i]) {
12524       if (V1IsZeroOrUndef) {
12525         ForceV1Zero = true;
12526         Mask[i] = i;
12527         continue;
12528       }
12529       if (V2IsZeroOrUndef) {
12530         ForceV2Zero = true;
12531         BlendMask |= 1ull << i;
12532         Mask[i] = i + Size;
12533         continue;
12534       }
12535     }
12536     return false;
12537   }
12538   return true;
12539 }
12540 
12541 static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
12542                                             int Scale) {
12543   uint64_t ScaledMask = 0;
12544   for (int i = 0; i != Size; ++i)
12545     if (BlendMask & (1ull << i))
12546       ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
12547   return ScaledMask;
12548 }
12549 
12550 /// Try to emit a blend instruction for a shuffle.
12551 ///
12552 /// This doesn't do any checks for the availability of instructions for blending
12553 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
12554 /// be matched in the backend with the type given. What it does check for is
12555 /// that the shuffle mask is a blend, or convertible into a blend with zero.
12556 static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
12557                                    SDValue V2, ArrayRef<int> Original,
12558                                    const APInt &Zeroable,
12559                                    const X86Subtarget &Subtarget,
12560                                    SelectionDAG &DAG) {
12561   uint64_t BlendMask = 0;
12562   bool ForceV1Zero = false, ForceV2Zero = false;
12563   SmallVector<int, 64> Mask(Original.begin(), Original.end());
12564   if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
12565                            BlendMask))
12566     return SDValue();
12567 
12568   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
12569   if (ForceV1Zero)
12570     V1 = getZeroVector(VT, Subtarget, DAG, DL);
12571   if (ForceV2Zero)
12572     V2 = getZeroVector(VT, Subtarget, DAG, DL);
12573 
12574   unsigned NumElts = VT.getVectorNumElements();
12575 
12576   switch (VT.SimpleTy) {
12577   case MVT::v4i64:
12578   case MVT::v8i32:
12579     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
12580     LLVM_FALLTHROUGH;
12581   case MVT::v4f64:
12582   case MVT::v8f32:
12583     assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
12584     LLVM_FALLTHROUGH;
12585   case MVT::v2f64:
12586   case MVT::v2i64:
12587   case MVT::v4f32:
12588   case MVT::v4i32:
12589   case MVT::v8i16:
12590     assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
12591     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
12592                        DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12593   case MVT::v16i16: {
12594     assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
12595     SmallVector<int, 8> RepeatedMask;
12596     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12597       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
12598       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
12599       BlendMask = 0;
12600       for (int i = 0; i < 8; ++i)
12601         if (RepeatedMask[i] >= 8)
12602           BlendMask |= 1ull << i;
12603       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12604                          DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12605     }
12606     // Use PBLENDW for lower/upper lanes and then blend lanes.
12607     // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
12608     // merge to VSELECT where useful.
12609     uint64_t LoMask = BlendMask & 0xFF;
12610     uint64_t HiMask = (BlendMask >> 8) & 0xFF;
12611     if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
12612       SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12613                                DAG.getTargetConstant(LoMask, DL, MVT::i8));
12614       SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12615                                DAG.getTargetConstant(HiMask, DL, MVT::i8));
12616       return DAG.getVectorShuffle(
12617           MVT::v16i16, DL, Lo, Hi,
12618           {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
12619     }
12620     LLVM_FALLTHROUGH;
12621   }
12622   case MVT::v32i8:
12623     assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
12624     LLVM_FALLTHROUGH;
12625   case MVT::v16i8: {
12626     assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
12627 
12628     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
12629     if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12630                                                Subtarget, DAG))
12631       return Masked;
12632 
12633     if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
12634       MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
12635       SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12636       return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12637     }
12638 
12639     // If we have VPTERNLOG, we can use that as a bit blend.
12640     if (Subtarget.hasVLX())
12641       if (SDValue BitBlend =
12642               lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12643         return BitBlend;
12644 
12645     // Scale the blend by the number of bytes per element.
12646     int Scale = VT.getScalarSizeInBits() / 8;
12647 
12648     // This form of blend is always done on bytes. Compute the byte vector
12649     // type.
12650     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12651 
12652     // x86 allows load folding with blendvb from the 2nd source operand. But
12653     // we are still using LLVM select here (see comment below), so that's V1.
12654     // If V2 can be load-folded and V1 cannot be load-folded, then commute to
12655     // allow that load-folding possibility.
12656     if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
12657       ShuffleVectorSDNode::commuteMask(Mask);
12658       std::swap(V1, V2);
12659     }
12660 
12661     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
12662     // mix of LLVM's code generator and the x86 backend. We tell the code
12663     // generator that boolean values in the elements of an x86 vector register
12664     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
12665     // mapping a select to operand #1, and 'false' mapping to operand #2. The
12666     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
12667     // of the element (the remaining are ignored) and 0 in that high bit would
12668     // mean operand #1 while 1 in the high bit would mean operand #2. So while
12669     // the LLVM model for boolean values in vector elements gets the relevant
12670     // bit set, it is set backwards and over constrained relative to x86's
12671     // actual model.
12672     SmallVector<SDValue, 32> VSELECTMask;
12673     for (int i = 0, Size = Mask.size(); i < Size; ++i)
12674       for (int j = 0; j < Scale; ++j)
12675         VSELECTMask.push_back(
12676             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
12677                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
12678                                           MVT::i8));
12679 
12680     V1 = DAG.getBitcast(BlendVT, V1);
12681     V2 = DAG.getBitcast(BlendVT, V2);
12682     return DAG.getBitcast(
12683         VT,
12684         DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
12685                       V1, V2));
12686   }
12687   case MVT::v16f32:
12688   case MVT::v8f64:
12689   case MVT::v8i64:
12690   case MVT::v16i32:
12691   case MVT::v32i16:
12692   case MVT::v64i8: {
12693     // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
12694     bool OptForSize = DAG.shouldOptForSize();
12695     if (!OptForSize) {
12696       if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12697                                                  Subtarget, DAG))
12698         return Masked;
12699     }
12700 
12701     // Otherwise load an immediate into a GPR, cast to k-register, and use a
12702     // masked move.
12703     MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
12704     SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12705     return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12706   }
12707   default:
12708     llvm_unreachable("Not a supported integer vector type!");
12709   }
12710 }
12711 
12712 /// Try to lower as a blend of elements from two inputs followed by
12713 /// a single-input permutation.
12714 ///
12715 /// This matches the pattern where we can blend elements from two inputs and
12716 /// then reduce the shuffle to a single-input permutation.
12717 static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
12718                                              SDValue V1, SDValue V2,
12719                                              ArrayRef<int> Mask,
12720                                              SelectionDAG &DAG,
12721                                              bool ImmBlends = false) {
12722   // We build up the blend mask while checking whether a blend is a viable way
12723   // to reduce the shuffle.
12724   SmallVector<int, 32> BlendMask(Mask.size(), -1);
12725   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
12726 
12727   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12728     if (Mask[i] < 0)
12729       continue;
12730 
12731     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
12732 
12733     if (BlendMask[Mask[i] % Size] < 0)
12734       BlendMask[Mask[i] % Size] = Mask[i];
12735     else if (BlendMask[Mask[i] % Size] != Mask[i])
12736       return SDValue(); // Can't blend in the needed input!
12737 
12738     PermuteMask[i] = Mask[i] % Size;
12739   }
12740 
12741   // If only immediate blends, then bail if the blend mask can't be widened to
12742   // i16.
12743   unsigned EltSize = VT.getScalarSizeInBits();
12744   if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
12745     return SDValue();
12746 
12747   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
12748   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
12749 }
12750 
12751 /// Try to lower as an unpack of elements from two inputs followed by
12752 /// a single-input permutation.
12753 ///
12754 /// This matches the pattern where we can unpack elements from two inputs and
12755 /// then reduce the shuffle to a single-input (wider) permutation.
12756 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
12757                                              SDValue V1, SDValue V2,
12758                                              ArrayRef<int> Mask,
12759                                              SelectionDAG &DAG) {
12760   int NumElts = Mask.size();
12761   int NumLanes = VT.getSizeInBits() / 128;
12762   int NumLaneElts = NumElts / NumLanes;
12763   int NumHalfLaneElts = NumLaneElts / 2;
12764 
12765   bool MatchLo = true, MatchHi = true;
12766   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12767 
12768   // Determine UNPCKL/UNPCKH type and operand order.
12769   for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12770     for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
12771       int M = Mask[Lane + Elt];
12772       if (M < 0)
12773         continue;
12774 
12775       SDValue &Op = Ops[Elt & 1];
12776       if (M < NumElts && (Op.isUndef() || Op == V1))
12777         Op = V1;
12778       else if (NumElts <= M && (Op.isUndef() || Op == V2))
12779         Op = V2;
12780       else
12781         return SDValue();
12782 
12783       int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
12784       MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
12785                  isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
12786       MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
12787                  isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
12788       if (!MatchLo && !MatchHi)
12789         return SDValue();
12790     }
12791   }
12792   assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
12793 
12794   // Now check that each pair of elts come from the same unpack pair
12795   // and set the permute mask based on each pair.
12796   // TODO - Investigate cases where we permute individual elements.
12797   SmallVector<int, 32> PermuteMask(NumElts, -1);
12798   for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12799     for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
12800       int M0 = Mask[Lane + Elt + 0];
12801       int M1 = Mask[Lane + Elt + 1];
12802       if (0 <= M0 && 0 <= M1 &&
12803           (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
12804         return SDValue();
12805       if (0 <= M0)
12806         PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
12807       if (0 <= M1)
12808         PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
12809     }
12810   }
12811 
12812   unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12813   SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
12814   return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
12815 }
12816 
12817 /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
12818 /// permuting the elements of the result in place.
12819 static SDValue lowerShuffleAsByteRotateAndPermute(
12820     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12821     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12822   if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
12823       (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
12824       (VT.is512BitVector() && !Subtarget.hasBWI()))
12825     return SDValue();
12826 
12827   // We don't currently support lane crossing permutes.
12828   if (is128BitLaneCrossingShuffleMask(VT, Mask))
12829     return SDValue();
12830 
12831   int Scale = VT.getScalarSizeInBits() / 8;
12832   int NumLanes = VT.getSizeInBits() / 128;
12833   int NumElts = VT.getVectorNumElements();
12834   int NumEltsPerLane = NumElts / NumLanes;
12835 
12836   // Determine range of mask elts.
12837   bool Blend1 = true;
12838   bool Blend2 = true;
12839   std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12840   std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12841   for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12842     for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12843       int M = Mask[Lane + Elt];
12844       if (M < 0)
12845         continue;
12846       if (M < NumElts) {
12847         Blend1 &= (M == (Lane + Elt));
12848         assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12849         M = M % NumEltsPerLane;
12850         Range1.first = std::min(Range1.first, M);
12851         Range1.second = std::max(Range1.second, M);
12852       } else {
12853         M -= NumElts;
12854         Blend2 &= (M == (Lane + Elt));
12855         assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12856         M = M % NumEltsPerLane;
12857         Range2.first = std::min(Range2.first, M);
12858         Range2.second = std::max(Range2.second, M);
12859       }
12860     }
12861   }
12862 
12863   // Bail if we don't need both elements.
12864   // TODO - it might be worth doing this for unary shuffles if the permute
12865   // can be widened.
12866   if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12867       !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12868     return SDValue();
12869 
12870   if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
12871     return SDValue();
12872 
12873   // Rotate the 2 ops so we can access both ranges, then permute the result.
12874   auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
12875     MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12876     SDValue Rotate = DAG.getBitcast(
12877         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
12878                         DAG.getBitcast(ByteVT, Lo),
12879                         DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
12880     SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
12881     for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12882       for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12883         int M = Mask[Lane + Elt];
12884         if (M < 0)
12885           continue;
12886         if (M < NumElts)
12887           PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12888         else
12889           PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12890       }
12891     }
12892     return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
12893   };
12894 
12895   // Check if the ranges are small enough to rotate from either direction.
12896   if (Range2.second < Range1.first)
12897     return RotateAndPermute(V1, V2, Range1.first, 0);
12898   if (Range1.second < Range2.first)
12899     return RotateAndPermute(V2, V1, Range2.first, NumElts);
12900   return SDValue();
12901 }
12902 
12903 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
12904   return isUndefOrEqual(Mask, 0);
12905 }
12906 
12907 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
12908   return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
12909 }
12910 
12911 /// Generic routine to decompose a shuffle and blend into independent
12912 /// blends and permutes.
12913 ///
12914 /// This matches the extremely common pattern for handling combined
12915 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
12916 /// operations. It will try to pick the best arrangement of shuffles and
12917 /// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
12918 static SDValue lowerShuffleAsDecomposedShuffleMerge(
12919     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12920     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12921   int NumElts = Mask.size();
12922   int NumLanes = VT.getSizeInBits() / 128;
12923   int NumEltsPerLane = NumElts / NumLanes;
12924 
12925   // Shuffle the input elements into the desired positions in V1 and V2 and
12926   // unpack/blend them together.
12927   bool IsAlternating = true;
12928   SmallVector<int, 32> V1Mask(NumElts, -1);
12929   SmallVector<int, 32> V2Mask(NumElts, -1);
12930   SmallVector<int, 32> FinalMask(NumElts, -1);
12931   for (int i = 0; i < NumElts; ++i) {
12932     int M = Mask[i];
12933     if (M >= 0 && M < NumElts) {
12934       V1Mask[i] = M;
12935       FinalMask[i] = i;
12936       IsAlternating &= (i & 1) == 0;
12937     } else if (M >= NumElts) {
12938       V2Mask[i] = M - NumElts;
12939       FinalMask[i] = i + NumElts;
12940       IsAlternating &= (i & 1) == 1;
12941     }
12942   }
12943 
  // If every demanded element of \p Input is its 0'th element (though not
  // necessarily only in the 0'th result position), then broadcast said input,
  // and change \p InputMask to be a no-op (identity) mask.
12947   auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
12948                                          &DAG](SDValue &Input,
12949                                                MutableArrayRef<int> InputMask) {
12950     unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
12951     if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
12952                                  !X86::mayFoldLoad(Input, Subtarget)))
12953       return;
12954     if (isNoopShuffleMask(InputMask))
12955       return;
12956     assert(isBroadcastShuffleMask(InputMask) &&
12957            "Expected to demand only the 0'th element.");
12958     Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
12959     for (auto I : enumerate(InputMask)) {
12960       int &InputMaskElt = I.value();
12961       if (InputMaskElt >= 0)
12962         InputMaskElt = I.index();
12963     }
12964   };
12965 
12966   // Currently, we may need to produce one shuffle per input, and blend results.
12967   // It is possible that the shuffle for one of the inputs is already a no-op.
12968   // See if we can simplify non-no-op shuffles into broadcasts,
12969   // which we consider to be strictly better than an arbitrary shuffle.
12970   if (isNoopOrBroadcastShuffleMask(V1Mask) &&
12971       isNoopOrBroadcastShuffleMask(V2Mask)) {
12972     canonicalizeBroadcastableInput(V1, V1Mask);
12973     canonicalizeBroadcastableInput(V2, V2Mask);
12974   }
12975 
12976   // Try to lower with the simpler initial blend/unpack/rotate strategies unless
12977   // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
12978   // the shuffle may be able to fold with a load or other benefit. However, when
12979   // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
12980   // pre-shuffle first is a better strategy.
12981   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
12982     // Only prefer immediate blends to unpack/rotate.
12983     if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12984                                                           DAG, true))
12985       return BlendPerm;
12986     if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
12987                                                            DAG))
12988       return UnpackPerm;
12989     if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
12990             DL, VT, V1, V2, Mask, Subtarget, DAG))
12991       return RotatePerm;
12992     // Unpack/rotate failed - try again with variable blends.
12993     if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12994                                                           DAG))
12995       return BlendPerm;
12996   }
12997 
12998   // If the final mask is an alternating blend of vXi8/vXi16, convert to an
12999   // UNPCKL(SHUFFLE, SHUFFLE) pattern.
13000   // TODO: It doesn't have to be alternating - but each lane mustn't have more
13001   // than half the elements coming from each source.
13002   if (IsAlternating && VT.getScalarSizeInBits() < 32) {
13003     V1Mask.assign(NumElts, -1);
13004     V2Mask.assign(NumElts, -1);
13005     FinalMask.assign(NumElts, -1);
13006     for (int i = 0; i != NumElts; i += NumEltsPerLane)
13007       for (int j = 0; j != NumEltsPerLane; ++j) {
13008         int M = Mask[i + j];
13009         if (M >= 0 && M < NumElts) {
13010           V1Mask[i + (j / 2)] = M;
13011           FinalMask[i + j] = i + (j / 2);
13012         } else if (M >= NumElts) {
13013           V2Mask[i + (j / 2)] = M - NumElts;
13014           FinalMask[i + j] = i + (j / 2) + NumElts;
13015         }
13016       }
13017   }
13018 
13019   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13020   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13021   return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
13022 }
13023 
13024 /// Try to lower a vector shuffle as a bit rotation.
13025 ///
13026 /// Look for a repeated rotation pattern in each sub group.
/// Returns an ISD::ROTL element rotation amount, or -1 on failure.
13028 static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
13029   int NumElts = Mask.size();
13030   assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
13031 
13032   int RotateAmt = -1;
13033   for (int i = 0; i != NumElts; i += NumSubElts) {
13034     for (int j = 0; j != NumSubElts; ++j) {
13035       int M = Mask[i + j];
13036       if (M < 0)
13037         continue;
13038       if (!isInRange(M, i, i + NumSubElts))
13039         return -1;
13040       int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
13041       if (0 <= RotateAmt && Offset != RotateAmt)
13042         return -1;
13043       RotateAmt = Offset;
13044     }
13045   }
13046   return RotateAmt;
13047 }
13048 
13049 static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
13050                                    const X86Subtarget &Subtarget,
13051                                    ArrayRef<int> Mask) {
13052   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13053   assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
13054 
13055   // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
13056   int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
13057   int MaxSubElts = 64 / EltSizeInBits;
13058   for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
13059     int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
13060     if (RotateAmt < 0)
13061       continue;
13062 
13063     int NumElts = Mask.size();
13064     MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
13065     RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
13066     return RotateAmt * EltSizeInBits;
13067   }
13068 
13069   return -1;
13070 }
13071 
13072 /// Lower shuffle using X86ISD::VROTLI rotations.
13073 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
13074                                        ArrayRef<int> Mask,
13075                                        const X86Subtarget &Subtarget,
13076                                        SelectionDAG &DAG) {
13077   // Only XOP + AVX512 targets have bit rotation instructions.
13078   // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
13079   bool IsLegal =
13080       (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
13081   if (!IsLegal && Subtarget.hasSSE3())
13082     return SDValue();
13083 
13084   MVT RotateVT;
13085   int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
13086                                           Subtarget, Mask);
13087   if (RotateAmt < 0)
13088     return SDValue();
13089 
  // For pre-SSSE3 targets, if we are shuffling vXi8 elts, then ISD::ROTL,
  // expanded to OR(SRL,SHL), will be more efficient, but if they can widen to
  // vXi16 or more then the existing lowering should already be better.
13093   if (!IsLegal) {
13094     if ((RotateAmt % 16) == 0)
13095       return SDValue();
13096     // TODO: Use getTargetVShiftByConstNode.
13097     unsigned ShlAmt = RotateAmt;
13098     unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
13099     V1 = DAG.getBitcast(RotateVT, V1);
13100     SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
13101                               DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
13102     SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
13103                               DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
13104     SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
13105     return DAG.getBitcast(VT, Rot);
13106   }
13107 
13108   SDValue Rot =
13109       DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
13110                   DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
13111   return DAG.getBitcast(VT, Rot);
13112 }
13113 
13114 /// Try to match a vector shuffle as an element rotation.
13115 ///
/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
13117 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
13118                                        ArrayRef<int> Mask) {
13119   int NumElts = Mask.size();
13120 
13121   // We need to detect various ways of spelling a rotation:
13122   //   [11, 12, 13, 14, 15,  0,  1,  2]
13123   //   [-1, 12, 13, 14, -1, -1,  1, -1]
13124   //   [-1, -1, -1, -1, -1, -1,  1,  2]
13125   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
13126   //   [-1,  4,  5,  6, -1, -1,  9, -1]
13127   //   [-1,  4,  5,  6, -1, -1, -1, -1]
13128   int Rotation = 0;
13129   SDValue Lo, Hi;
13130   for (int i = 0; i < NumElts; ++i) {
13131     int M = Mask[i];
13132     assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
13133            "Unexpected mask index.");
13134     if (M < 0)
13135       continue;
13136 
13137     // Determine where a rotated vector would have started.
13138     int StartIdx = i - (M % NumElts);
13139     if (StartIdx == 0)
13140       // The identity rotation isn't interesting, stop.
13141       return -1;
13142 
13143     // If we found the tail of a vector the rotation must be the missing
13144     // front. If we found the head of a vector, it must be how much of the
13145     // head.
13146     int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
13147 
13148     if (Rotation == 0)
13149       Rotation = CandidateRotation;
13150     else if (Rotation != CandidateRotation)
13151       // The rotations don't match, so we can't match this mask.
13152       return -1;
13153 
13154     // Compute which value this mask is pointing at.
13155     SDValue MaskV = M < NumElts ? V1 : V2;
13156 
13157     // Compute which of the two target values this index should be assigned
13158     // to. This reflects whether the high elements are remaining or the low
13159     // elements are remaining.
13160     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
13161 
13162     // Either set up this value if we've not encountered it before, or check
13163     // that it remains consistent.
13164     if (!TargetV)
13165       TargetV = MaskV;
13166     else if (TargetV != MaskV)
13167       // This may be a rotation, but it pulls from the inputs in some
13168       // unsupported interleaving.
13169       return -1;
13170   }
13171 
13172   // Check that we successfully analyzed the mask, and normalize the results.
13173   assert(Rotation != 0 && "Failed to locate a viable rotation!");
13174   assert((Lo || Hi) && "Failed to find a rotated input vector!");
13175   if (!Lo)
13176     Lo = Hi;
13177   else if (!Hi)
13178     Hi = Lo;
13179 
13180   V1 = Lo;
13181   V2 = Hi;
13182 
13183   return Rotation;
13184 }
13185 
13186 /// Try to lower a vector shuffle as a byte rotation.
13187 ///
13188 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
13189 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
13190 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
13192 /// does not check for the profitability of lowering either as PALIGNR or
13193 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
13194 /// This matches shuffle vectors that look like:
13195 ///
13196 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
13197 ///
13198 /// Essentially it concatenates V1 and V2, shifts right by some number of
13199 /// elements, and takes the low elements as the result. Note that while this is
13200 /// specified as a *right shift* because x86 is little-endian, it is a *left
13201 /// rotate* of the vector lanes.
13202 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
13203                                     ArrayRef<int> Mask) {
13204   // Don't accept any shuffles with zero elements.
13205   if (isAnyZero(Mask))
13206     return -1;
13207 
13208   // PALIGNR works on 128-bit lanes.
13209   SmallVector<int, 16> RepeatedMask;
13210   if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
13211     return -1;
13212 
13213   int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
13214   if (Rotation <= 0)
13215     return -1;
13216 
13217   // PALIGNR rotates bytes, so we need to scale the
13218   // rotation based on how many bytes are in the vector lane.
13219   int NumElts = RepeatedMask.size();
13220   int Scale = 16 / NumElts;
13221   return Rotation * Scale;
13222 }
13223 
13224 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
13225                                         SDValue V2, ArrayRef<int> Mask,
13226                                         const X86Subtarget &Subtarget,
13227                                         SelectionDAG &DAG) {
13228   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13229 
13230   SDValue Lo = V1, Hi = V2;
13231   int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
13232   if (ByteRotation <= 0)
13233     return SDValue();
13234 
13235   // Cast the inputs to i8 vector of correct length to match PALIGNR or
13236   // PSLLDQ/PSRLDQ.
13237   MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13238   Lo = DAG.getBitcast(ByteVT, Lo);
13239   Hi = DAG.getBitcast(ByteVT, Hi);
13240 
13241   // SSSE3 targets can use the palignr instruction.
13242   if (Subtarget.hasSSSE3()) {
13243     assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
13244            "512-bit PALIGNR requires BWI instructions");
13245     return DAG.getBitcast(
13246         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
13247                         DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
13248   }
13249 
13250   assert(VT.is128BitVector() &&
13251          "Rotate-based lowering only supports 128-bit lowering!");
13252   assert(Mask.size() <= 16 &&
13253          "Can shuffle at most 16 bytes in a 128-bit vector!");
13254   assert(ByteVT == MVT::v16i8 &&
13255          "SSE2 rotate lowering only needed for v16i8!");
13256 
13257   // Default SSE2 implementation
13258   int LoByteShift = 16 - ByteRotation;
13259   int HiByteShift = ByteRotation;
13260 
13261   SDValue LoShift =
13262       DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
13263                   DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
13264   SDValue HiShift =
13265       DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
13266                   DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
13267   return DAG.getBitcast(VT,
13268                         DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
13269 }
13270 
13271 /// Try to lower a vector shuffle as a dword/qword rotation.
13272 ///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
13276 ///
13277 /// Essentially it concatenates V1 and V2, shifts right by some number of
13278 /// elements, and takes the low elements as the result. Note that while this is
13279 /// specified as a *right shift* because x86 is little-endian, it is a *left
13280 /// rotate* of the vector lanes.
13281 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
13282                                     SDValue V2, ArrayRef<int> Mask,
13283                                     const X86Subtarget &Subtarget,
13284                                     SelectionDAG &DAG) {
13285   assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
13286          "Only 32-bit and 64-bit elements are supported!");
13287 
13288   // 128/256-bit vectors are only supported with VLX.
13289   assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
13290          && "VLX required for 128/256-bit vectors");
13291 
13292   SDValue Lo = V1, Hi = V2;
13293   int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
13294   if (Rotation <= 0)
13295     return SDValue();
13296 
13297   return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
13298                      DAG.getTargetConstant(Rotation, DL, MVT::i8));
13299 }
13300 
13301 /// Try to lower a vector shuffle as a byte shift sequence.
13302 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
13303                                            SDValue V2, ArrayRef<int> Mask,
13304                                            const APInt &Zeroable,
13305                                            const X86Subtarget &Subtarget,
13306                                            SelectionDAG &DAG) {
13307   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13308   assert(VT.is128BitVector() && "Only 128-bit vectors supported");
13309 
13310   // We need a shuffle that has zeros at one/both ends and a sequential
13311   // shuffle from one source within.
13312   unsigned ZeroLo = Zeroable.countTrailingOnes();
13313   unsigned ZeroHi = Zeroable.countLeadingOnes();
13314   if (!ZeroLo && !ZeroHi)
13315     return SDValue();
13316 
13317   unsigned NumElts = Mask.size();
13318   unsigned Len = NumElts - (ZeroLo + ZeroHi);
13319   if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
13320     return SDValue();
13321 
13322   unsigned Scale = VT.getScalarSizeInBits() / 8;
13323   ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
13324   if (!isUndefOrInRange(StubMask, 0, NumElts) &&
13325       !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
13326     return SDValue();
13327 
13328   SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
13329   Res = DAG.getBitcast(MVT::v16i8, Res);
13330 
13331   // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
13332   // inner sequential set of elements, possibly offset:
13333   // 01234567 --> zzzzzz01 --> 1zzzzzzz
13334   // 01234567 --> 4567zzzz --> zzzzz456
13335   // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
13336   if (ZeroLo == 0) {
13337     unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13338     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13339                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13340     Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13341                       DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
13342   } else if (ZeroHi == 0) {
13343     unsigned Shift = Mask[ZeroLo] % NumElts;
13344     Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13345                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13346     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13347                       DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13348   } else if (!Subtarget.hasSSSE3()) {
    // If we don't have PSHUFB then it's worth avoiding an AND constant mask
13350     // by performing 3 byte shifts. Shuffle combining can kick in above that.
13351     // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
13352     unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13353     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13354                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13355     Shift += Mask[ZeroLo] % NumElts;
13356     Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13357                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13358     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13359                       DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13360   } else
13361     return SDValue();
13362 
13363   return DAG.getBitcast(VT, Res);
13364 }
13365 
13366 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
13367 ///
13368 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
13369 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
13370 /// matches elements from one of the input vectors shuffled to the left or
13371 /// right with zeroable elements 'shifted in'. It handles both the strictly
13372 /// bit-wise element shifts and the byte shift across an entire 128-bit double
13373 /// quad word lane.
13374 ///
/// PSLL : (little-endian) left bit shift.
13376 /// [ zz, 0, zz,  2 ]
13377 /// [ -1, 4, zz, -1 ]
13378 /// PSRL : (little-endian) right bit shift.
13379 /// [  1, zz,  3, zz]
13380 /// [ -1, -1,  7, zz]
13381 /// PSLLDQ : (little-endian) left byte shift
13382 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
13383 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
13384 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
13385 /// PSRLDQ : (little-endian) right byte shift
13386 /// [  5, 6,  7, zz, zz, zz, zz, zz]
13387 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
13388 /// [  1, 2, -1, -1, -1, -1, zz, zz]
13389 static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
13390                                unsigned ScalarSizeInBits, ArrayRef<int> Mask,
13391                                int MaskOffset, const APInt &Zeroable,
13392                                const X86Subtarget &Subtarget) {
13393   int Size = Mask.size();
13394   unsigned SizeInBits = Size * ScalarSizeInBits;
13395 
13396   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
13397     for (int i = 0; i < Size; i += Scale)
13398       for (int j = 0; j < Shift; ++j)
13399         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
13400           return false;
13401 
13402     return true;
13403   };
13404 
13405   auto MatchShift = [&](int Shift, int Scale, bool Left) {
13406     for (int i = 0; i != Size; i += Scale) {
13407       unsigned Pos = Left ? i + Shift : i;
13408       unsigned Low = Left ? i : i + Shift;
13409       unsigned Len = Scale - Shift;
13410       if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
13411         return -1;
13412     }
13413 
13414     int ShiftEltBits = ScalarSizeInBits * Scale;
13415     bool ByteShift = ShiftEltBits > 64;
13416     Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
13417                   : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
13418     int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
13419 
13420     // Normalize the scale for byte shifts to still produce an i64 element
13421     // type.
13422     Scale = ByteShift ? Scale / 2 : Scale;
13423 
13424     // We need to round trip through the appropriate type for the shift.
13425     MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
13426     ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
13427                         : MVT::getVectorVT(ShiftSVT, Size / Scale);
13428     return (int)ShiftAmt;
13429   };
13430 
13431   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
13432   // keep doubling the size of the integer elements up to that. We can
13433   // then shift the elements of the integer vector by whole multiples of
13434   // their width within the elements of the larger integer vector. Test each
13435   // multiple to see if we can find a match with the moved element indices
13436   // and that the shifted in elements are all zeroable.
13437   unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
13438   for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
13439     for (int Shift = 1; Shift != Scale; ++Shift)
13440       for (bool Left : {true, false})
13441         if (CheckZeros(Shift, Scale, Left)) {
13442           int ShiftAmt = MatchShift(Shift, Scale, Left);
13443           if (0 < ShiftAmt)
13444             return ShiftAmt;
13445         }
13446 
13447   // no match
13448   return -1;
13449 }
13450 
13451 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
13452                                    SDValue V2, ArrayRef<int> Mask,
13453                                    const APInt &Zeroable,
13454                                    const X86Subtarget &Subtarget,
13455                                    SelectionDAG &DAG) {
13456   int Size = Mask.size();
13457   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13458 
13459   MVT ShiftVT;
13460   SDValue V = V1;
13461   unsigned Opcode;
13462 
13463   // Try to match shuffle against V1 shift.
13464   int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
13465                                      Mask, 0, Zeroable, Subtarget);
13466 
13467   // If V1 failed, try to match shuffle against V2 shift.
13468   if (ShiftAmt < 0) {
13469     ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
13470                                    Mask, Size, Zeroable, Subtarget);
13471     V = V2;
13472   }
13473 
13474   if (ShiftAmt < 0)
13475     return SDValue();
13476 
13477   assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
13478          "Illegal integer vector type");
13479   V = DAG.getBitcast(ShiftVT, V);
13480   V = DAG.getNode(Opcode, DL, ShiftVT, V,
13481                   DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
13482   return DAG.getBitcast(VT, V);
13483 }
13484 
13485 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
13486 // Remainder of lower half result is zero and upper half is all undef.
13487 static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
13488                                 ArrayRef<int> Mask, uint64_t &BitLen,
13489                                 uint64_t &BitIdx, const APInt &Zeroable) {
13490   int Size = Mask.size();
13491   int HalfSize = Size / 2;
13492   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13493   assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
13494 
13495   // Upper half must be undefined.
13496   if (!isUndefUpperHalf(Mask))
13497     return false;
13498 
13499   // Determine the extraction length from the part of the
13500   // lower half that isn't zeroable.
13501   int Len = HalfSize;
13502   for (; Len > 0; --Len)
13503     if (!Zeroable[Len - 1])
13504       break;
13505   assert(Len > 0 && "Zeroable shuffle mask");
13506 
13507   // Attempt to match first Len sequential elements from the lower half.
13508   SDValue Src;
13509   int Idx = -1;
13510   for (int i = 0; i != Len; ++i) {
13511     int M = Mask[i];
13512     if (M == SM_SentinelUndef)
13513       continue;
13514     SDValue &V = (M < Size ? V1 : V2);
13515     M = M % Size;
13516 
13517     // The extracted elements must start at a valid index and all mask
13518     // elements must be in the lower half.
13519     if (i > M || M >= HalfSize)
13520       return false;
13521 
13522     if (Idx < 0 || (Src == V && Idx == (M - i))) {
13523       Src = V;
13524       Idx = M - i;
13525       continue;
13526     }
13527     return false;
13528   }
13529 
13530   if (!Src || Idx < 0)
13531     return false;
13532 
13533   assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
13534   BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13535   BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13536   V1 = Src;
13537   return true;
13538 }
13539 
13540 // INSERTQ: Extract lowest Len elements from lower half of second source and
13541 // insert over first source, starting at Idx.
13542 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
13543 static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
13544                                   ArrayRef<int> Mask, uint64_t &BitLen,
13545                                   uint64_t &BitIdx) {
13546   int Size = Mask.size();
13547   int HalfSize = Size / 2;
13548   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13549 
13550   // Upper half must be undefined.
13551   if (!isUndefUpperHalf(Mask))
13552     return false;
13553 
13554   for (int Idx = 0; Idx != HalfSize; ++Idx) {
13555     SDValue Base;
13556 
13557     // Attempt to match first source from mask before insertion point.
13558     if (isUndefInRange(Mask, 0, Idx)) {
13559       /* EMPTY */
13560     } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
13561       Base = V1;
13562     } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
13563       Base = V2;
13564     } else {
13565       continue;
13566     }
13567 
13568     // Extend the extraction length looking to match both the insertion of
13569     // the second source and the remaining elements of the first.
13570     for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
13571       SDValue Insert;
13572       int Len = Hi - Idx;
13573 
13574       // Match insertion.
13575       if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
13576         Insert = V1;
13577       } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
13578         Insert = V2;
13579       } else {
13580         continue;
13581       }
13582 
13583       // Match the remaining elements of the lower half.
13584       if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
13585         /* EMPTY */
13586       } else if ((!Base || (Base == V1)) &&
13587                  isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
13588         Base = V1;
13589       } else if ((!Base || (Base == V2)) &&
13590                  isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
13591                                             Size + Hi)) {
13592         Base = V2;
13593       } else {
13594         continue;
13595       }
13596 
13597       BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13598       BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13599       V1 = Base;
13600       V2 = Insert;
13601       return true;
13602     }
13603   }
13604 
13605   return false;
13606 }
13607 
13608 /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
13609 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
13610                                      SDValue V2, ArrayRef<int> Mask,
13611                                      const APInt &Zeroable, SelectionDAG &DAG) {
13612   uint64_t BitLen, BitIdx;
13613   if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
13614     return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
13615                        DAG.getTargetConstant(BitLen, DL, MVT::i8),
13616                        DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13617 
13618   if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
13619     return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
13620                        V2 ? V2 : DAG.getUNDEF(VT),
13621                        DAG.getTargetConstant(BitLen, DL, MVT::i8),
13622                        DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13623 
13624   return SDValue();
13625 }
13626 
13627 /// Lower a vector shuffle as a zero or any extension.
13628 ///
13629 /// Given a specific number of elements, element bit width, and extension
13630 /// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
/// can start from an offset element index in the input; to avoid excess
/// shuffling the offset must either be in the bottom lane or at the start
/// of a higher lane. All extended elements must come from the same lane.
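///
/// For example, with Scale == 2 a v16i8 zero extension corresponds to the
/// mask <0, Z, 1, Z, .., 7, Z> (Z = zeroable): bytes 0..7 of the input are
/// zero-extended into the eight 16-bit elements of the result.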
13636 static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
13637     const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
13638     ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13639   assert(Scale > 1 && "Need a scale to extend.");
13640   int EltBits = VT.getScalarSizeInBits();
13641   int NumElements = VT.getVectorNumElements();
13642   int NumEltsPerLane = 128 / EltBits;
13643   int OffsetLane = Offset / NumEltsPerLane;
13644   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
13645          "Only 8, 16, and 32 bit elements can be extended.");
13646   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be non-negative.");
13648   assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
13649          "Extension offset must be in the first lane or start an upper lane.");
13650 
13651   // Check that an index is in same lane as the base offset.
13652   auto SafeOffset = [&](int Idx) {
13653     return OffsetLane == (Idx / NumEltsPerLane);
13654   };
13655 
13656   // Shift along an input so that the offset base moves to the first element.
13657   auto ShuffleOffset = [&](SDValue V) {
13658     if (!Offset)
13659       return V;
13660 
13661     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13662     for (int i = 0; i * Scale < NumElements; ++i) {
13663       int SrcIdx = i + Offset;
13664       ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
13665     }
13666     return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
13667   };
13668 
13669   // Found a valid a/zext mask! Try various lowering strategies based on the
13670   // input type and available ISA extensions.
13671   if (Subtarget.hasSSE41()) {
    // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
    // PUNPCK will catch this in a later shuffle match.
13674     if (Offset && Scale == 2 && VT.is128BitVector())
13675       return SDValue();
13676     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
13677                                  NumElements / Scale);
13678     InputV = ShuffleOffset(InputV);
13679     InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
13680                                     DL, ExtVT, InputV, DAG);
13681     return DAG.getBitcast(VT, InputV);
13682   }
13683 
13684   assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
13685 
  // For any-extends we can cheat for larger element sizes and use shuffle
  // instructions that can fold with a load and/or copy.
13688   if (AnyExt && EltBits == 32) {
13689     int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
13690                          -1};
13691     return DAG.getBitcast(
13692         VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13693                         DAG.getBitcast(MVT::v4i32, InputV),
13694                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13695   }
13696   if (AnyExt && EltBits == 16 && Scale > 2) {
13697     int PSHUFDMask[4] = {Offset / 2, -1,
13698                          SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
13699     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13700                          DAG.getBitcast(MVT::v4i32, InputV),
13701                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13702     int PSHUFWMask[4] = {1, -1, -1, -1};
13703     unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
13704     return DAG.getBitcast(
13705         VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
13706                         DAG.getBitcast(MVT::v8i16, InputV),
13707                         getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
13708   }
13709 
  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
  // to 64 bits.
13712   if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
13713     assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
13714     assert(VT.is128BitVector() && "Unexpected vector width!");
13715 
13716     int LoIdx = Offset * EltBits;
13717     SDValue Lo = DAG.getBitcast(
13718         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13719                                 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13720                                 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
13721 
13722     if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
13723       return DAG.getBitcast(VT, Lo);
13724 
13725     int HiIdx = (Offset + 1) * EltBits;
13726     SDValue Hi = DAG.getBitcast(
13727         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13728                                 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13729                                 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
13730     return DAG.getBitcast(VT,
13731                           DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
13732   }
13733 
  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero-extending i8 elements, which also makes it easier to use
  // pshufb.
13737   if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
13738     assert(NumElements == 16 && "Unexpected byte vector width!");
13739     SDValue PSHUFBMask[16];
13740     for (int i = 0; i < 16; ++i) {
13741       int Idx = Offset + (i / Scale);
13742       if ((i % Scale == 0 && SafeOffset(Idx))) {
13743         PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
13744         continue;
13745       }
13746       PSHUFBMask[i] =
13747           AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
13748     }
13749     InputV = DAG.getBitcast(MVT::v16i8, InputV);
13750     return DAG.getBitcast(
13751         VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
13752                         DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
13753   }
13754 
13755   // If we are extending from an offset, ensure we start on a boundary that
13756   // we can unpack from.
13757   int AlignToUnpack = Offset % (NumElements / Scale);
13758   if (AlignToUnpack) {
13759     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13760     for (int i = AlignToUnpack; i < NumElements; ++i)
13761       ShMask[i - AlignToUnpack] = i;
13762     InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
13763     Offset -= AlignToUnpack;
13764   }
13765 
13766   // Otherwise emit a sequence of unpacks.
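  // For example, zero-extending v16i8 to v4i32 without SSE4.1 takes two
  // rounds: unpack the bytes with a zero vector (i8 -> i16), then unpack the
  // resulting words with zero again (i16 -> i32).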
13767   do {
13768     unsigned UnpackLoHi = X86ISD::UNPCKL;
13769     if (Offset >= (NumElements / 2)) {
13770       UnpackLoHi = X86ISD::UNPCKH;
13771       Offset -= (NumElements / 2);
13772     }
13773 
13774     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
13775     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
13776                          : getZeroVector(InputVT, Subtarget, DAG, DL);
13777     InputV = DAG.getBitcast(InputVT, InputV);
13778     InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
13779     Scale /= 2;
13780     EltBits *= 2;
13781     NumElements /= 2;
13782   } while (Scale > 1);
13783   return DAG.getBitcast(VT, InputV);
13784 }
13785 
13786 /// Try to lower a vector shuffle as a zero extension on any microarch.
13787 ///
13788 /// This routine will try to do everything in its power to cleverly lower
13789 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering; it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs (to explicitly zero-extend) and undef lanes (sometimes undef due
/// to later masking).
13795 ///
13796 /// The reason we have dedicated lowering for zext-style shuffles is that they
13797 /// are both incredibly common and often quite performance sensitive.
13798 static SDValue lowerShuffleAsZeroOrAnyExtend(
13799     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13800     const APInt &Zeroable, const X86Subtarget &Subtarget,
13801     SelectionDAG &DAG) {
13802   int Bits = VT.getSizeInBits();
13803   int NumLanes = Bits / 128;
13804   int NumElements = VT.getVectorNumElements();
13805   int NumEltsPerLane = NumElements / NumLanes;
13806   assert(VT.getScalarSizeInBits() <= 32 &&
13807          "Exceeds 32-bit integer zero extension limit");
13808   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
13809 
13810   // Define a helper function to check a particular ext-scale and lower to it if
13811   // valid.
13812   auto Lower = [&](int Scale) -> SDValue {
13813     SDValue InputV;
13814     bool AnyExt = true;
13815     int Offset = 0;
13816     int Matches = 0;
13817     for (int i = 0; i < NumElements; ++i) {
13818       int M = Mask[i];
13819       if (M < 0)
13820         continue; // Valid anywhere but doesn't tell us anything.
13821       if (i % Scale != 0) {
        // Each of the extended elements needs to be zeroable.
13823         if (!Zeroable[i])
13824           return SDValue();
13825 
        // We are no longer in the any-extend case.
13827         AnyExt = false;
13828         continue;
13829       }
13830 
      // The base elements need to form consecutive indices into the
      // same input vector.
13833       SDValue V = M < NumElements ? V1 : V2;
13834       M = M % NumElements;
13835       if (!InputV) {
13836         InputV = V;
13837         Offset = M - (i / Scale);
13838       } else if (InputV != V)
13839         return SDValue(); // Flip-flopping inputs.
13840 
13841       // Offset must start in the lowest 128-bit lane or at the start of an
13842       // upper lane.
13843       // FIXME: Is it ever worth allowing a negative base offset?
13844       if (!((0 <= Offset && Offset < NumEltsPerLane) ||
13845             (Offset % NumEltsPerLane) == 0))
13846         return SDValue();
13847 
13848       // If we are offsetting, all referenced entries must come from the same
13849       // lane.
13850       if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
13851         return SDValue();
13852 
13853       if ((M % NumElements) != (Offset + (i / Scale)))
13854         return SDValue(); // Non-consecutive strided elements.
13855       Matches++;
13856     }
13857 
13858     // If we fail to find an input, we have a zero-shuffle which should always
13859     // have already been handled.
13860     // FIXME: Maybe handle this here in case during blending we end up with one?
13861     if (!InputV)
13862       return SDValue();
13863 
    // If we are offsetting, don't extend if we only match a single input; we
    // can always do better by using a basic PSHUF or PUNPCK.
13866     if (Offset != 0 && Matches < 2)
13867       return SDValue();
13868 
13869     return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
13870                                                  InputV, Mask, Subtarget, DAG);
13871   };
13872 
13873   // The widest scale possible for extending is to a 64-bit integer.
13874   assert(Bits % 64 == 0 &&
13875          "The number of bits in a vector must be divisible by 64 on x86!");
13876   int NumExtElements = Bits / 64;
13877 
13878   // Each iteration, try extending the elements half as much, but into twice as
13879   // many elements.
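  // For example, for v16i8 this tries Scale == 8 (i8 -> i64) first, then
  // Scale == 4 (i8 -> i32), then Scale == 2 (i8 -> i16).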
13880   for (; NumExtElements < NumElements; NumExtElements *= 2) {
13881     assert(NumElements % NumExtElements == 0 &&
13882            "The input vector size must be divisible by the extended size.");
13883     if (SDValue V = Lower(NumElements / NumExtElements))
13884       return V;
13885   }
13886 
13887   // General extends failed, but 128-bit vectors may be able to use MOVQ.
13888   if (Bits != 128)
13889     return SDValue();
13890 
  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64 bits and zero-filling the upper 64 bits.
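  // For example, the v4i32 mask <0, 1, Z, Z> (upper half zeroable) reduces to
  // a MOVQ of V1.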
13893   auto CanZExtLowHalf = [&]() {
13894     for (int i = NumElements / 2; i != NumElements; ++i)
13895       if (!Zeroable[i])
13896         return SDValue();
13897     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
13898       return V1;
13899     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
13900       return V2;
13901     return SDValue();
13902   };
13903 
13904   if (SDValue V = CanZExtLowHalf()) {
13905     V = DAG.getBitcast(MVT::v2i64, V);
13906     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
13907     return DAG.getBitcast(VT, V);
13908   }
13909 
13910   // No viable ext lowering found.
13911   return SDValue();
13912 }
13913 
13914 /// Try to get a scalar value for a specific element of a vector.
13915 ///
13916 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
13917 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
13918                                               SelectionDAG &DAG) {
13919   MVT VT = V.getSimpleValueType();
13920   MVT EltVT = VT.getVectorElementType();
13921   V = peekThroughBitcasts(V);
13922 
  // If the bitcasts change the element size, we can't extract an equivalent
  // element from it.
13925   MVT NewVT = V.getSimpleValueType();
13926   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
13927     return SDValue();
13928 
13929   if (V.getOpcode() == ISD::BUILD_VECTOR ||
13930       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
13931     // Ensure the scalar operand is the same size as the destination.
13932     // FIXME: Add support for scalar truncation where possible.
13933     SDValue S = V.getOperand(Idx);
13934     if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
13935       return DAG.getBitcast(EltVT, S);
13936   }
13937 
13938   return SDValue();
13939 }
13940 
13941 /// Helper to test for a load that can be folded with x86 shuffles.
13942 ///
13943 /// This is particularly important because the set of instructions varies
13944 /// significantly based on whether the operand is a load or not.
13945 static bool isShuffleFoldableLoad(SDValue V) {
13946   V = peekThroughBitcasts(V);
13947   return ISD::isNON_EXTLoad(V.getNode());
13948 }
13949 
13950 /// Try to lower insertion of a single element into a zero vector.
13951 ///
/// This is a common pattern for which we have especially efficient lowering
/// patterns across all subtarget feature sets.
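///
/// For example, the v4i32 mask <4, Z, Z, Z> (V2[0] inserted into an otherwise
/// zeroable vector) lowers to a VZEXT_MOVL of V2, i.e. a move of the low
/// element that zeros the upper elements.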
13954 static SDValue lowerShuffleAsElementInsertion(
13955     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13956     const APInt &Zeroable, const X86Subtarget &Subtarget,
13957     SelectionDAG &DAG) {
13958   MVT ExtVT = VT;
13959   MVT EltVT = VT.getVectorElementType();
13960 
13961   int V2Index =
13962       find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
13963       Mask.begin();
13964   bool IsV1Zeroable = true;
13965   for (int i = 0, Size = Mask.size(); i < Size; ++i)
13966     if (i != V2Index && !Zeroable[i]) {
13967       IsV1Zeroable = false;
13968       break;
13969     }
13970 
13971   // Check for a single input from a SCALAR_TO_VECTOR node.
13972   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
13973   // all the smarts here sunk into that routine. However, the current
13974   // lowering of BUILD_VECTOR makes that nearly impossible until the old
13975   // vector shuffle lowering is dead.
13976   SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
13977                                                DAG);
13978   if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
13979     // We need to zext the scalar if it is smaller than an i32.
13980     V2S = DAG.getBitcast(EltVT, V2S);
13981     if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
13982       // Using zext to expand a narrow element won't work for non-zero
13983       // insertions.
13984       if (!IsV1Zeroable)
13985         return SDValue();
13986 
13987       // Zero-extend directly to i32.
13988       ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
13989       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
13990     }
13991     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13992   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
13993              EltVT == MVT::i16) {
13994     // Either not inserting from the low element of the input or the input
13995     // element size is too small to use VZEXT_MOVL to clear the high bits.
13996     return SDValue();
13997   }
13998 
13999   if (!IsV1Zeroable) {
    // If V1 can't be treated as a zero vector we have fewer options to lower
    // this. We can't cheaply support integer vectors or insertion at a
    // non-zero index, and the V1 elements can't be permuted in any way.
14003     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
14004     if (!VT.isFloatingPoint() || V2Index != 0)
14005       return SDValue();
14006     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
14007     V1Mask[V2Index] = -1;
14008     if (!isNoopShuffleMask(V1Mask))
14009       return SDValue();
14010     if (!VT.is128BitVector())
14011       return SDValue();
14012 
14013     // Otherwise, use MOVSD, MOVSS or MOVSH.
14014     unsigned MovOpc = 0;
14015     if (EltVT == MVT::f16)
14016       MovOpc = X86ISD::MOVSH;
14017     else if (EltVT == MVT::f32)
14018       MovOpc = X86ISD::MOVSS;
14019     else if (EltVT == MVT::f64)
14020       MovOpc = X86ISD::MOVSD;
14021     else
14022       llvm_unreachable("Unsupported floating point element type to handle!");
14023     return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
14024   }
14025 
14026   // This lowering only works for the low element with floating point vectors.
14027   if (VT.isFloatingPoint() && V2Index != 0)
14028     return SDValue();
14029 
14030   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
14031   if (ExtVT != VT)
14032     V2 = DAG.getBitcast(VT, V2);
14033 
14034   if (V2Index != 0) {
14035     // If we have 4 or fewer lanes we can cheaply shuffle the element into
14036     // the desired position. Otherwise it is more efficient to do a vector
14037     // shift left. We know that we can do a vector shift left because all
14038     // the inputs are zero.
14039     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
14040       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
14041       V2Shuffle[V2Index] = 0;
14042       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
14043     } else {
14044       V2 = DAG.getBitcast(MVT::v16i8, V2);
14045       V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
14046                        DAG.getTargetConstant(
14047                            V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
14048       V2 = DAG.getBitcast(VT, V2);
14049     }
14050   }
14051   return V2;
14052 }
14053 
/// Try to lower a broadcast of a single (truncated) integer element,
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
14056 ///
14057 /// This assumes we have AVX2.
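///
/// For example, a v8i16 broadcast of element 2 where \p V0 is a v4i32
/// build_vector becomes a VBROADCAST of build_vector operand 1 truncated
/// to i16.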
14058 static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
14059                                             int BroadcastIdx,
14060                                             const X86Subtarget &Subtarget,
14061                                             SelectionDAG &DAG) {
14062   assert(Subtarget.hasAVX2() &&
14063          "We can only lower integer broadcasts with AVX2!");
14064 
14065   MVT EltVT = VT.getVectorElementType();
14066   MVT V0VT = V0.getSimpleValueType();
14067 
14068   assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
14069   assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
14070 
14071   MVT V0EltVT = V0VT.getVectorElementType();
14072   if (!V0EltVT.isInteger())
14073     return SDValue();
14074 
14075   const unsigned EltSize = EltVT.getSizeInBits();
14076   const unsigned V0EltSize = V0EltVT.getSizeInBits();
14077 
14078   // This is only a truncation if the original element type is larger.
14079   if (V0EltSize <= EltSize)
14080     return SDValue();
14081 
14082   assert(((V0EltSize % EltSize) == 0) &&
14083          "Scalar type sizes must all be powers of 2 on x86!");
14084 
14085   const unsigned V0Opc = V0.getOpcode();
14086   const unsigned Scale = V0EltSize / EltSize;
14087   const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
14088 
14089   if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
14090       V0Opc != ISD::BUILD_VECTOR)
14091     return SDValue();
14092 
14093   SDValue Scalar = V0.getOperand(V0BroadcastIdx);
14094 
14095   // If we're extracting non-least-significant bits, shift so we can truncate.
14096   // Hopefully, we can fold away the trunc/srl/load into the broadcast.
14097   // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
14098   // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
14099   if (const int OffsetIdx = BroadcastIdx % Scale)
14100     Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
14101                          DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
14102 
14103   return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
14104                      DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
14105 }
14106 
14107 /// Test whether this can be lowered with a single SHUFPS instruction.
14108 ///
14109 /// This is used to disable more specialized lowerings when the shufps lowering
14110 /// will happen to be efficient.
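///
/// For example, <0, 1, 4, 5> (low half from one input, high half from the
/// other) needs only a single SHUFPS, while <0, 4, 1, 5> does not.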
14111 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
14112   // This routine only handles 128-bit shufps.
14113   assert(Mask.size() == 4 && "Unsupported mask size!");
14114   assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
14115   assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
14116   assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
14117   assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
14118 
14119   // To lower with a single SHUFPS we need to have the low half and high half
14120   // each requiring a single input.
14121   if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
14122     return false;
14123   if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
14124     return false;
14125 
14126   return true;
14127 }
14128 
14129 /// If we are extracting two 128-bit halves of a vector and shuffling the
14130 /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
14131 /// multi-shuffle lowering.
14132 static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
14133                                              SDValue N1, ArrayRef<int> Mask,
14134                                              SelectionDAG &DAG) {
14135   MVT VT = N0.getSimpleValueType();
14136   assert((VT.is128BitVector() &&
14137           (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
14138          "VPERM* family of shuffles requires 32-bit or 64-bit elements");
14139 
14140   // Check that both sources are extracts of the same source vector.
14141   if (!N0.hasOneUse() || !N1.hasOneUse() ||
14142       N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14143       N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14144       N0.getOperand(0) != N1.getOperand(0))
14145     return SDValue();
14146 
14147   SDValue WideVec = N0.getOperand(0);
14148   MVT WideVT = WideVec.getSimpleValueType();
14149   if (!WideVT.is256BitVector())
14150     return SDValue();
14151 
14152   // Match extracts of each half of the wide source vector. Commute the shuffle
14153   // if the extract of the low half is N1.
14154   unsigned NumElts = VT.getVectorNumElements();
14155   SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14156   const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
14157   const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
14158   if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
14159     ShuffleVectorSDNode::commuteMask(NewMask);
14160   else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
14161     return SDValue();
14162 
14163   // Final bailout: if the mask is simple, we are better off using an extract
14164   // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
14165   // because that avoids a constant load from memory.
14166   if (NumElts == 4 &&
14167       (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
14168     return SDValue();
14169 
14170   // Extend the shuffle mask with undef elements.
14171   NewMask.append(NumElts, -1);
14172 
14173   // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
14174   SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
14175                                       NewMask);
14176   // This is free: ymm -> xmm.
14177   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
14178                      DAG.getIntPtrConstant(0, DL));
14179 }
14180 
14181 /// Try to lower broadcast of a single element.
14182 ///
14183 /// For convenience, this code also bundles all of the subtarget feature set
14184 /// filtering. While a little annoying to re-dispatch on type here, there isn't
14185 /// a convenient way to factor it out.
14186 static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
14187                                        SDValue V2, ArrayRef<int> Mask,
14188                                        const X86Subtarget &Subtarget,
14189                                        SelectionDAG &DAG) {
14190   if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
14191         (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
14192         (Subtarget.hasAVX2() && VT.isInteger())))
14193     return SDValue();
14194 
  // With MOVDDUP (v2f64) we can broadcast from a register or a load; otherwise
  // we can only broadcast from a register if we have AVX2.
14197   unsigned NumEltBits = VT.getScalarSizeInBits();
14198   unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
14199                         ? X86ISD::MOVDDUP
14200                         : X86ISD::VBROADCAST;
14201   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
14202 
14203   // Check that the mask is a broadcast.
14204   int BroadcastIdx = getSplatIndex(Mask);
14205   if (BroadcastIdx < 0)
14206     return SDValue();
14207   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
14208                                             "a sorted mask where the broadcast "
14209                                             "comes from V1.");
14210 
14211   // Go up the chain of (vector) values to find a scalar load that we can
14212   // combine with the broadcast.
14213   // TODO: Combine this logic with findEltLoadSrc() used by
14214   //       EltsFromConsecutiveLoads().
14215   int BitOffset = BroadcastIdx * NumEltBits;
14216   SDValue V = V1;
14217   for (;;) {
14218     switch (V.getOpcode()) {
14219     case ISD::BITCAST: {
14220       V = V.getOperand(0);
14221       continue;
14222     }
14223     case ISD::CONCAT_VECTORS: {
14224       int OpBitWidth = V.getOperand(0).getValueSizeInBits();
14225       int OpIdx = BitOffset / OpBitWidth;
14226       V = V.getOperand(OpIdx);
14227       BitOffset %= OpBitWidth;
14228       continue;
14229     }
14230     case ISD::EXTRACT_SUBVECTOR: {
14231       // The extraction index adds to the existing offset.
14232       unsigned EltBitWidth = V.getScalarValueSizeInBits();
14233       unsigned Idx = V.getConstantOperandVal(1);
14234       unsigned BeginOffset = Idx * EltBitWidth;
14235       BitOffset += BeginOffset;
14236       V = V.getOperand(0);
14237       continue;
14238     }
14239     case ISD::INSERT_SUBVECTOR: {
14240       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
14241       int EltBitWidth = VOuter.getScalarValueSizeInBits();
14242       int Idx = (int)V.getConstantOperandVal(2);
14243       int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
14244       int BeginOffset = Idx * EltBitWidth;
14245       int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
14246       if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
14247         BitOffset -= BeginOffset;
14248         V = VInner;
14249       } else {
14250         V = VOuter;
14251       }
14252       continue;
14253     }
14254     }
14255     break;
14256   }
14257   assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
14258   BroadcastIdx = BitOffset / NumEltBits;
14259 
14260   // Do we need to bitcast the source to retrieve the original broadcast index?
14261   bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
14262 
14263   // Check if this is a broadcast of a scalar. We special case lowering
14264   // for scalars so that we can more effectively fold with loads.
14265   // If the original value has a larger element type than the shuffle, the
14266   // broadcast element is in essence truncated. Make that explicit to ease
14267   // folding.
14268   if (BitCastSrc && VT.isInteger())
14269     if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
14270             DL, VT, V, BroadcastIdx, Subtarget, DAG))
14271       return TruncBroadcast;
14272 
14273   // Also check the simpler case, where we can directly reuse the scalar.
14274   if (!BitCastSrc &&
14275       ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
14276        (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
14277     V = V.getOperand(BroadcastIdx);
14278 
14279     // If we can't broadcast from a register, check that the input is a load.
14280     if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
14281       return SDValue();
14282   } else if (ISD::isNormalLoad(V.getNode()) &&
14283              cast<LoadSDNode>(V)->isSimple()) {
14284     // We do not check for one-use of the vector load because a broadcast load
14285     // is expected to be a win for code size, register pressure, and possibly
14286     // uops even if the original vector load is not eliminated.
14287 
14288     // Reduce the vector load and shuffle to a broadcasted scalar load.
14289     LoadSDNode *Ld = cast<LoadSDNode>(V);
14290     SDValue BaseAddr = Ld->getOperand(1);
14291     MVT SVT = VT.getScalarType();
14292     unsigned Offset = BroadcastIdx * SVT.getStoreSize();
14293     assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
14294     SDValue NewAddr =
14295         DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
14296 
14297     // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
14298     // than MOVDDUP.
14299     // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
14300     if (Opcode == X86ISD::VBROADCAST) {
14301       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
14302       SDValue Ops[] = {Ld->getChain(), NewAddr};
14303       V = DAG.getMemIntrinsicNode(
14304           X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
14305           DAG.getMachineFunction().getMachineMemOperand(
14306               Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14307       DAG.makeEquivalentMemoryOrdering(Ld, V);
14308       return DAG.getBitcast(VT, V);
14309     }
14310     assert(SVT == MVT::f64 && "Unexpected VT!");
14311     V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
14312                     DAG.getMachineFunction().getMachineMemOperand(
14313                         Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14314     DAG.makeEquivalentMemoryOrdering(Ld, V);
14315   } else if (!BroadcastFromReg) {
14316     // We can't broadcast from a vector register.
14317     return SDValue();
14318   } else if (BitOffset != 0) {
14319     // We can only broadcast from the zero-element of a vector register,
14320     // but it can be advantageous to broadcast from the zero-element of a
14321     // subvector.
14322     if (!VT.is256BitVector() && !VT.is512BitVector())
14323       return SDValue();
14324 
14325     // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
14326     if (VT == MVT::v4f64 || VT == MVT::v4i64)
14327       return SDValue();
14328 
14329     // Only broadcast the zero-element of a 128-bit subvector.
14330     if ((BitOffset % 128) != 0)
14331       return SDValue();
14332 
14333     assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
14334            "Unexpected bit-offset");
14335     assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
14336            "Unexpected vector size");
14337     unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
14338     V = extract128BitVector(V, ExtractIdx, DAG, DL);
14339   }
14340 
14341   // On AVX we can use VBROADCAST directly for scalar sources.
14342   if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
14343     V = DAG.getBitcast(MVT::f64, V);
14344     if (Subtarget.hasAVX()) {
14345       V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
14346       return DAG.getBitcast(VT, V);
14347     }
14348     V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
14349   }
14350 
14351   // If this is a scalar, do the broadcast on this type and bitcast.
14352   if (!V.getValueType().isVector()) {
14353     assert(V.getScalarValueSizeInBits() == NumEltBits &&
14354            "Unexpected scalar size");
14355     MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
14356                                        VT.getVectorNumElements());
14357     return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
14358   }
14359 
14360   // We only support broadcasting from 128-bit vectors to minimize the
14361   // number of patterns we need to deal with in isel. So extract down to
14362   // 128-bits, removing as many bitcasts as possible.
14363   if (V.getValueSizeInBits() > 128)
14364     V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
14365 
14366   // Otherwise cast V to a vector with the same element type as VT, but
14367   // possibly narrower than VT. Then perform the broadcast.
14368   unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
14369   MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
14370   return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
14371 }
14372 
// Check whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can always just use two SHUFPS instructions, which
// are much smaller to encode than a SHUFPS plus an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
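// For example, the v4f32 mask <0, 1, 6, 3> (with no zeroable elements)
// inserts V2[2] into lane 2 of V1, giving an InsertPSMask of 0xA0
// (source element 2, destination element 2, empty zero mask).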
14379 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
14380                                    unsigned &InsertPSMask,
14381                                    const APInt &Zeroable,
14382                                    ArrayRef<int> Mask, SelectionDAG &DAG) {
14383   assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
14384   assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
14385   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14386 
14387   // Attempt to match INSERTPS with one element from VA or VB being
14388   // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
14389   // are updated.
14390   auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
14391                              ArrayRef<int> CandidateMask) {
14392     unsigned ZMask = 0;
14393     int VADstIndex = -1;
14394     int VBDstIndex = -1;
14395     bool VAUsedInPlace = false;
14396 
14397     for (int i = 0; i < 4; ++i) {
14398       // Synthesize a zero mask from the zeroable elements (includes undefs).
14399       if (Zeroable[i]) {
14400         ZMask |= 1 << i;
14401         continue;
14402       }
14403 
14404       // Flag if we use any VA inputs in place.
14405       if (i == CandidateMask[i]) {
14406         VAUsedInPlace = true;
14407         continue;
14408       }
14409 
14410       // We can only insert a single non-zeroable element.
14411       if (VADstIndex >= 0 || VBDstIndex >= 0)
14412         return false;
14413 
14414       if (CandidateMask[i] < 4) {
14415         // VA input out of place for insertion.
14416         VADstIndex = i;
14417       } else {
14418         // VB input for insertion.
14419         VBDstIndex = i;
14420       }
14421     }
14422 
14423     // Don't bother if we have no (non-zeroable) element for insertion.
14424     if (VADstIndex < 0 && VBDstIndex < 0)
14425       return false;
14426 
14427     // Determine element insertion src/dst indices. The src index is from the
14428     // start of the inserted vector, not the start of the concatenated vector.
14429     unsigned VBSrcIndex = 0;
14430     if (VADstIndex >= 0) {
14431       // If we have a VA input out of place, we use VA as the V2 element
14432       // insertion and don't use the original V2 at all.
14433       VBSrcIndex = CandidateMask[VADstIndex];
14434       VBDstIndex = VADstIndex;
14435       VB = VA;
14436     } else {
14437       VBSrcIndex = CandidateMask[VBDstIndex] - 4;
14438     }
14439 
14440     // If no V1 inputs are used in place, then the result is created only from
14441     // the zero mask and the V2 insertion - so remove V1 dependency.
14442     if (!VAUsedInPlace)
14443       VA = DAG.getUNDEF(MVT::v4f32);
14444 
14445     // Update V1, V2 and InsertPSMask accordingly.
14446     V1 = VA;
14447     V2 = VB;
14448 
14449     // Insert the V2 element into the desired position.
14450     InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
14451     assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
14452     return true;
14453   };
14454 
14455   if (matchAsInsertPS(V1, V2, Mask))
14456     return true;
14457 
14458   // Commute and try again.
14459   SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
14460   ShuffleVectorSDNode::commuteMask(CommutedMask);
14461   if (matchAsInsertPS(V2, V1, CommutedMask))
14462     return true;
14463 
14464   return false;
14465 }
14466 
14467 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
14468                                       ArrayRef<int> Mask, const APInt &Zeroable,
14469                                       SelectionDAG &DAG) {
14470   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14471   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14472 
14473   // Attempt to match the insertps pattern.
14474   unsigned InsertPSMask = 0;
14475   if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
14476     return SDValue();
14477 
14478   // Insert the V2 element into the desired position.
14479   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
14480                      DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
14481 }
14482 
14483 /// Try to lower a shuffle as a permute of the inputs followed by an
14484 /// UNPCK instruction.
14485 ///
/// This specifically targets cases where we end up alternating between the
/// two inputs, and so can permute them into something that feeds a single
14488 /// UNPCK instruction. Note that this routine only targets integer vectors
14489 /// because for floating point vectors we have a generalized SHUFPS lowering
14490 /// strategy that handles everything that doesn't *exactly* match an unpack,
14491 /// making this clever lowering unnecessary.
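///
/// For example, the v4i32 mask <2, 6, 1, 5> can be handled by permuting both
/// inputs with <2, 1, u, u> and then unpacking them together with UNPCKLDQ.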
14492 static SDValue lowerShuffleAsPermuteAndUnpack(
14493     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14494     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14495   assert(!VT.isFloatingPoint() &&
14496          "This routine only supports integer vectors.");
14497   assert(VT.is128BitVector() &&
14498          "This routine only works on 128-bit vectors.");
14499   assert(!V2.isUndef() &&
14500          "This routine should only be used when blending two inputs.");
14501   assert(Mask.size() >= 2 && "Single element masks are invalid.");
14502 
14503   int Size = Mask.size();
14504 
14505   int NumLoInputs =
14506       count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
14507   int NumHiInputs =
14508       count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
14509 
14510   bool UnpackLo = NumLoInputs >= NumHiInputs;
14511 
14512   auto TryUnpack = [&](int ScalarSize, int Scale) {
14513     SmallVector<int, 16> V1Mask((unsigned)Size, -1);
14514     SmallVector<int, 16> V2Mask((unsigned)Size, -1);
14515 
14516     for (int i = 0; i < Size; ++i) {
14517       if (Mask[i] < 0)
14518         continue;
14519 
14520       // Each element of the unpack contains Scale elements from this mask.
14521       int UnpackIdx = i / Scale;
14522 
14523       // We only handle the case where V1 feeds the first slots of the unpack.
14524       // We rely on canonicalization to ensure this is the case.
14525       if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
14526         return SDValue();
14527 
      // Set up the mask for this input. The indexing is tricky as we have to
14529       // handle the unpack stride.
14530       SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
14531       VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
14532           Mask[i] % Size;
14533     }
14534 
14535     // If we will have to shuffle both inputs to use the unpack, check whether
14536     // we can just unpack first and shuffle the result. If so, skip this unpack.
14537     if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
14538         !isNoopShuffleMask(V2Mask))
14539       return SDValue();
14540 
14541     // Shuffle the inputs into place.
14542     V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
14543     V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
14544 
14545     // Cast the inputs to the type we will use to unpack them.
14546     MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
14547     V1 = DAG.getBitcast(UnpackVT, V1);
14548     V2 = DAG.getBitcast(UnpackVT, V2);
14549 
14550     // Unpack the inputs and cast the result back to the desired type.
14551     return DAG.getBitcast(
14552         VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14553                         UnpackVT, V1, V2));
14554   };
14555 
  // We try each unpack from the largest to the smallest to find one that
  // fits this mask.
14558   int OrigScalarSize = VT.getScalarSizeInBits();
14559   for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
14560     if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
14561       return Unpack;
14562 
14563   // If we're shuffling with a zero vector then we're better off not doing
14564   // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
14565   if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
14566       ISD::isBuildVectorAllZeros(V2.getNode()))
14567     return SDValue();
14568 
14569   // If none of the unpack-rooted lowerings worked (or were profitable) try an
14570   // initial unpack.
14571   if (NumLoInputs == 0 || NumHiInputs == 0) {
14572     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
14573            "We have to have *some* inputs!");
14574     int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
14575 
14576     // FIXME: We could consider the total complexity of the permute of each
14577     // possible unpacking. Or at the least we should consider how many
14578     // half-crossings are created.
14579     // FIXME: We could consider commuting the unpacks.
14580 
14581     SmallVector<int, 32> PermMask((unsigned)Size, -1);
14582     for (int i = 0; i < Size; ++i) {
14583       if (Mask[i] < 0)
14584         continue;
14585 
14586       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
14587 
14588       PermMask[i] =
14589           2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
14590     }
14591     return DAG.getVectorShuffle(
14592         VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
14593                             DL, VT, V1, V2),
14594         DAG.getUNDEF(VT), PermMask);
14595   }
14596 
14597   return SDValue();
14598 }
14599 
14600 /// Handle lowering of 2-lane 64-bit floating point shuffles.
14601 ///
14602 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
14603 /// support for floating point shuffles but not integer shuffles. These
14604 /// instructions will incur a domain crossing penalty on some chips though so
14605 /// it is better to avoid lowering through this for integer vectors where
14606 /// possible.
14607 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14608                                  const APInt &Zeroable, SDValue V1, SDValue V2,
14609                                  const X86Subtarget &Subtarget,
14610                                  SelectionDAG &DAG) {
14611   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14612   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14613   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14614 
14615   if (V2.isUndef()) {
14616     // Check for being able to broadcast a single element.
14617     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
14618                                                     Mask, Subtarget, DAG))
14619       return Broadcast;
14620 
14621     // Straight shuffle of a single input vector. Simulate this by using the
    // single input as both of the "inputs" to this instruction.
14623     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
14624 
14625     if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPD, which will allow folding a load
      // into the shuffle.
14628       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
14629                          DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14630     }
14631 
14632     return DAG.getNode(
14633         X86ISD::SHUFP, DL, MVT::v2f64,
14634         Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14635         Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14636         DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14637   }
14638   assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14639   assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14640   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14641   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14642 
14643   if (Subtarget.hasAVX2())
14644     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14645       return Extract;
14646 
14647   // When loading a scalar and then shuffling it into a vector we can often do
14648   // the insertion cheaply.
14649   if (SDValue Insertion = lowerShuffleAsElementInsertion(
14650           DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14651     return Insertion;
14652   // Try inverting the insertion since for v2 masks it is easy to do and we
14653   // can't reliably sort the mask one way or the other.
14654   int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
14655                         Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
14656   if (SDValue Insertion = lowerShuffleAsElementInsertion(
14657           DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14658     return Insertion;
14659 
14660   // Try to use one of the special instruction patterns to handle two common
14661   // blend patterns if a zero-blend above didn't work.
14662   if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
14663       isShuffleEquivalent(Mask, {1, 3}, V1, V2))
14664     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
14665       // We can either use a special instruction to load over the low double or
14666       // to move just the low double.
14667       return DAG.getNode(
14668           X86ISD::MOVSD, DL, MVT::v2f64, V2,
14669           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
14670 
14671   if (Subtarget.hasSSE41())
14672     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
14673                                             Zeroable, Subtarget, DAG))
14674       return Blend;
14675 
14676   // Use dedicated unpack instructions for masks that match their pattern.
14677   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
14678     return V;
14679 
14680   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
14681   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
14682                      DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14683 }
14684 
14685 /// Handle lowering of 2-lane 64-bit integer shuffles.
14686 ///
14687 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
14688 /// the integer unit to minimize domain crossing penalties. However, for blends
14689 /// it falls back to the floating point shuffle operation with appropriate bit
14690 /// casting.
14691 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14692                                  const APInt &Zeroable, SDValue V1, SDValue V2,
14693                                  const X86Subtarget &Subtarget,
14694                                  SelectionDAG &DAG) {
14695   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14696   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14697   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14698 
14699   if (V2.isUndef()) {
14700     // Check for being able to broadcast a single element.
14701     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
14702                                                     Mask, Subtarget, DAG))
14703       return Broadcast;
14704 
14705     // Straight shuffle of a single input vector. For everything from SSE2
14706     // onward this has a single fast instruction with no scary immediates.
14707     // We have to map the mask as it is actually a v4i32 shuffle instruction.
14708     V1 = DAG.getBitcast(MVT::v4i32, V1);
14709     int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
14710                           Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
14711                           Mask[1] < 0 ? -1 : (Mask[1] * 2),
14712                           Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
14713     return DAG.getBitcast(
14714         MVT::v2i64,
14715         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14716                     getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
14717   }
14718   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
14719   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
14720   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14721   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14722 
14723   if (Subtarget.hasAVX2())
14724     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14725       return Extract;
14726 
14727   // Try to use shift instructions.
14728   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
14729                                           Zeroable, Subtarget, DAG))
14730     return Shift;
14731 
14732   // When loading a scalar and then shuffling it into a vector we can often do
14733   // the insertion cheaply.
14734   if (SDValue Insertion = lowerShuffleAsElementInsertion(
14735           DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14736     return Insertion;
14737   // Try inverting the insertion since for v2 masks it is easy to do and we
14738   // can't reliably sort the mask one way or the other.
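  // XOR-ing each index with 2 flips which operand it refers to, e.g. the mask
  // <0, 3> becomes <2, 1> once V1 and V2 trade places.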
14739   int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14740   if (SDValue Insertion = lowerShuffleAsElementInsertion(
14741           DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14742     return Insertion;
14743 
14744   // We have different paths for blend lowering, but they all must use the
14745   // *exact* same predicate.
14746   bool IsBlendSupported = Subtarget.hasSSE41();
14747   if (IsBlendSupported)
14748     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14749                                             Zeroable, Subtarget, DAG))
14750       return Blend;
14751 
14752   // Use dedicated unpack instructions for masks that match their pattern.
14753   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
14754     return V;
14755 
14756   // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 targets to use shuffles/unpacks.
14758   if (Subtarget.hasSSSE3()) {
14759     if (Subtarget.hasVLX())
14760       if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14761                                                 Subtarget, DAG))
14762         return Rotate;
14763 
14764     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14765                                                   Subtarget, DAG))
14766       return Rotate;
14767   }
14768 
14769   // If we have direct support for blends, we should lower by decomposing into
14770   // a permute. That will be faster than the domain cross.
14771   if (IsBlendSupported)
14772     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14773                                                 Subtarget, DAG);
14774 
14775   // We implement this with SHUFPD which is pretty lame because it will likely
14776   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14777   // However, all the alternatives are still more cycles and newer chips don't
14778   // have this problem. It would be really nice if x86 had better shuffles here.
14779   V1 = DAG.getBitcast(MVT::v2f64, V1);
14780   V2 = DAG.getBitcast(MVT::v2f64, V2);
14781   return DAG.getBitcast(MVT::v2i64,
14782                         DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14783 }
14784 
14785 /// Lower a vector shuffle using the SHUFPS instruction.
14786 ///
/// This is a helper routine dedicated to lowering vector shuffles using
/// SHUFPS. It makes no assumptions about whether this is the *best* lowering;
/// it simply uses it.
14790 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
14791                                       ArrayRef<int> Mask, SDValue V1,
14792                                       SDValue V2, SelectionDAG &DAG) {
14793   SDValue LowV = V1, HighV = V2;
14794   SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14795   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14796 
14797   if (NumV2Elements == 1) {
14798     int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14799 
14800     // Compute the index adjacent to V2Index and in the same half by toggling
14801     // the low bit.
14802     int V2AdjIndex = V2Index ^ 1;
14803 
14804     if (Mask[V2AdjIndex] < 0) {
14805       // Handles all the cases where we have a single V2 element and an undef.
14806       // This will only ever happen in the high lanes because we commute the
14807       // vector otherwise.
14808       if (V2Index < 2)
14809         std::swap(LowV, HighV);
14810       NewMask[V2Index] -= 4;
14811     } else {
14812       // Handle the case where the V2 element ends up adjacent to a V1 element.
14813       // To make this work, blend them together as the first step.
14814       int V1Index = V2AdjIndex;
14815       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14816       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14817                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14818 
14819       // Now proceed to reconstruct the final blend as we have the necessary
14820       // high or low half formed.
14821       if (V2Index < 2) {
14822         LowV = V2;
14823         HighV = V1;
14824       } else {
14825         HighV = V2;
14826       }
14827       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14828       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
14829     }
14830   } else if (NumV2Elements == 2) {
14831     if (Mask[0] < 4 && Mask[1] < 4) {
14832       // Handle the easy case where we have V1 in the low lanes and V2 in the
14833       // high lanes.
14834       NewMask[2] -= 4;
14835       NewMask[3] -= 4;
14836     } else if (Mask[2] < 4 && Mask[3] < 4) {
14837       // We also handle the reversed case because this utility may get called
14838       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14839       // arrange things in the right direction.
14840       NewMask[0] -= 4;
14841       NewMask[1] -= 4;
14842       HighV = V1;
14843       LowV = V2;
14844     } else {
14845       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14846       // trying to place elements directly, just blend them and set up the final
14847       // shuffle to place them.
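      // For example, the mask <0, 4, 1, 6> is first blended to
      // [V1[0], V1[1], V2[0], V2[2]] and then finished with the permute
      // <0, 2, 1, 3>.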
14848 
14849       // The first two blend mask elements are for V1, the second two are for
14850       // V2.
14851       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14852                           Mask[2] < 4 ? Mask[2] : Mask[3],
14853                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14854                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14855       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14856                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14857 
14858       // Now we do a normal shuffle of V1 by giving V1 as both operands to
14859       // a blend.
14860       LowV = HighV = V1;
14861       NewMask[0] = Mask[0] < 4 ? 0 : 2;
14862       NewMask[1] = Mask[0] < 4 ? 2 : 0;
14863       NewMask[2] = Mask[2] < 4 ? 1 : 3;
14864       NewMask[3] = Mask[2] < 4 ? 3 : 1;
14865     }
14866   } else if (NumV2Elements == 3) {
    // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
    // we can get here due to other paths (e.g. repeated mask matching) where
    // we don't want to do another round of lowerVECTOR_SHUFFLE.
14870     ShuffleVectorSDNode::commuteMask(NewMask);
14871     return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14872   }
14873   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14874                      getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14875 }
14876 
14877 /// Lower 4-lane 32-bit floating point shuffles.
14878 ///
14879 /// Uses instructions exclusively from the floating point unit to minimize
14880 /// domain crossing penalties, as these are sufficient to implement all v4f32
14881 /// shuffles.
14882 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14883                                  const APInt &Zeroable, SDValue V1, SDValue V2,
14884                                  const X86Subtarget &Subtarget,
14885                                  SelectionDAG &DAG) {
14886   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14887   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14888   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14889 
14890   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14891 
14892   if (NumV2Elements == 0) {
14893     // Check for being able to broadcast a single element.
14894     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14895                                                     Mask, Subtarget, DAG))
14896       return Broadcast;
14897 
14898     // Use even/odd duplicate instructions for masks that match their pattern.
14899     if (Subtarget.hasSSE3()) {
14900       if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14901         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14902       if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14903         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
14904     }
14905 
14906     if (Subtarget.hasAVX()) {
14907       // If we have AVX, we can use VPERMILPS which will allow folding a load
14908       // into the shuffle.
14909       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14910                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14911     }
14912 
14913     // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14914     // in SSE1 because otherwise they are widened to v2f64 and never get here.
14915     if (!Subtarget.hasSSE2()) {
14916       if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14917         return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14918       if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14919         return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14920     }
14921 
14922     // Otherwise, use a straight shuffle of a single input vector. We pass the
14923     // input vector to both operands to simulate this with a SHUFPS.
14924     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14925                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14926   }
14927 
14928   if (Subtarget.hasAVX2())
14929     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14930       return Extract;
14931 
  // There are special ways we can lower some single-element blends. However,
  // we have custom ways to lower more complex single-element blends below that
  // we defer to if both this and BLENDPS fail to match, so restrict this to
  // when the V2 input targets element 0 of the mask -- that is the fast case
  // here.
14937   if (NumV2Elements == 1 && Mask[0] >= 4)
14938     if (SDValue V = lowerShuffleAsElementInsertion(
14939             DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14940       return V;
14941 
14942   if (Subtarget.hasSSE41()) {
14943     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14944                                             Zeroable, Subtarget, DAG))
14945       return Blend;
14946 
14947     // Use INSERTPS if we can complete the shuffle efficiently.
14948     if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14949       return V;
14950 
14951     if (!isSingleSHUFPSMask(Mask))
14952       if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
14953                                                             V2, Mask, DAG))
14954         return BlendPerm;
14955   }
14956 
14957   // Use low/high mov instructions. These are only valid in SSE1 because
14958   // otherwise they are widened to v2f64 and never get here.
14959   if (!Subtarget.hasSSE2()) {
14960     if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
14961       return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
14962     if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
14963       return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
14964   }
14965 
14966   // Use dedicated unpack instructions for masks that match their pattern.
14967   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
14968     return V;
14969 
14970   // Otherwise fall back to a SHUFPS lowering strategy.
14971   return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
14972 }
14973 
14974 /// Lower 4-lane i32 vector shuffles.
14975 ///
14976 /// We try to handle these with integer-domain shuffles where we can, but for
14977 /// blends we use the floating point domain blend instructions.
14978 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14979                                  const APInt &Zeroable, SDValue V1, SDValue V2,
14980                                  const X86Subtarget &Subtarget,
14981                                  SelectionDAG &DAG) {
14982   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14983   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14984   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14985 
14986   // Whenever we can lower this as a zext, that instruction is strictly faster
14987   // than any alternative. It also allows us to fold memory operands into the
14988   // shuffle in many cases.
14989   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
14990                                                    Zeroable, Subtarget, DAG))
14991     return ZExt;
14992 
14993   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14994 
14995   if (NumV2Elements == 0) {
14996     // Try to use broadcast unless the mask only has one non-undef element.
14997     if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
14998       if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
14999                                                       Mask, Subtarget, DAG))
15000         return Broadcast;
15001     }
15002 
15003     // Straight shuffle of a single input vector. For everything from SSE2
15004     // onward this has a single fast instruction with no scary immediates.
15005     // We coerce the shuffle pattern to be compatible with UNPCK instructions
15006     // but we aren't actually going to use the UNPCK instruction because doing
15007     // so prevents folding a load into this instruction or making a copy.
15008     const int UnpackLoMask[] = {0, 0, 1, 1};
15009     const int UnpackHiMask[] = {2, 2, 3, 3};
15010     if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
15011       Mask = UnpackLoMask;
15012     else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
15013       Mask = UnpackHiMask;
15014 
15015     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15016                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15017   }
15018 
15019   if (Subtarget.hasAVX2())
15020     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15021       return Extract;
15022 
15023   // Try to use shift instructions.
15024   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
15025                                           Zeroable, Subtarget, DAG))
15026     return Shift;
15027 
15028   // There are special ways we can lower some single-element blends.
15029   if (NumV2Elements == 1)
15030     if (SDValue V = lowerShuffleAsElementInsertion(
15031             DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15032       return V;
15033 
15034   // We have different paths for blend lowering, but they all must use the
15035   // *exact* same predicate.
15036   bool IsBlendSupported = Subtarget.hasSSE41();
15037   if (IsBlendSupported)
15038     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
15039                                             Zeroable, Subtarget, DAG))
15040       return Blend;
15041 
15042   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
15043                                              Zeroable, Subtarget, DAG))
15044     return Masked;
15045 
15046   // Use dedicated unpack instructions for masks that match their pattern.
15047   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
15048     return V;
15049 
15050   // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 targets to use shuffles/unpacks.
15052   if (Subtarget.hasSSSE3()) {
15053     if (Subtarget.hasVLX())
15054       if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
15055                                                 Subtarget, DAG))
15056         return Rotate;
15057 
15058     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
15059                                                   Subtarget, DAG))
15060       return Rotate;
15061   }
15062 
15063   // Assume that a single SHUFPS is faster than an alternative sequence of
15064   // multiple instructions (even if the CPU has a domain penalty).
15065   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15066   if (!isSingleSHUFPSMask(Mask)) {
15067     // If we have direct support for blends, we should lower by decomposing into
15068     // a permute. That will be faster than the domain cross.
15069     if (IsBlendSupported)
15070       return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
15071                                                   Subtarget, DAG);
15072 
15073     // Try to lower by permuting the inputs into an unpack instruction.
15074     if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
15075                                                         Mask, Subtarget, DAG))
15076       return Unpack;
15077   }
15078 
15079   // We implement this with SHUFPS because it can blend from two vectors.
15080   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
15081   // up the inputs, bypassing domain shift penalties that we would incur if we
15082   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
15083   // relevant.
15084   SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
15085   SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
15086   SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
15087   return DAG.getBitcast(MVT::v4i32, ShufPS);
15088 }
15089 
15090 /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
15091 /// shuffle lowering, and the most complex part.
15092 ///
15093 /// The lowering strategy is to try to form pairs of input lanes which are
15094 /// targeted at the same half of the final vector, and then use a dword shuffle
15095 /// to place them onto the right half, and finally unpack the paired lanes into
15096 /// their final position.
15097 ///
15098 /// The exact breakdown of how to form these dword pairs and align them on the
15099 /// correct sides is really tricky. See the comments within the function for
15100 /// more of the details.
15101 ///
15102 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
15103 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
15104 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
15105 /// vector, form the analogous 128-bit 8-element Mask.
15106 static SDValue lowerV8I16GeneralSingleInputShuffle(
15107     const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
15108     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15109   assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
15110   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
15111 
15112   assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
15113   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
15114   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
15115 
15116   // Attempt to directly match PSHUFLW or PSHUFHW.
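  // For example, <2, 1, 3, 0, 4, 5, 6, 7> is a single PSHUFLW with the
  // immediate mask <2, 1, 3, 0>, and <0, 1, 2, 3, 6, 7, 5, 4> is a single
  // PSHUFHW with <2, 3, 1, 0>.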
15117   if (isUndefOrInRange(LoMask, 0, 4) &&
15118       isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
15119     return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15120                        getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15121   }
15122   if (isUndefOrInRange(HiMask, 4, 8) &&
15123       isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
15124     for (int i = 0; i != 4; ++i)
15125       HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
15126     return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15127                        getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15128   }
15129 
15130   SmallVector<int, 4> LoInputs;
15131   copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
15132   array_pod_sort(LoInputs.begin(), LoInputs.end());
15133   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
15134   SmallVector<int, 4> HiInputs;
15135   copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
15136   array_pod_sort(HiInputs.begin(), HiInputs.end());
15137   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
15138   int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
15139   int NumHToL = LoInputs.size() - NumLToL;
15140   int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
15141   int NumHToH = HiInputs.size() - NumLToH;
15142   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
15143   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
15144   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
15145   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
15146 
  // If we are shuffling values from one half only, check how many different
  // DWORD pairs we need to create. If only 1 or 2 are needed, we can perform
  // this as a PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW
  // chain below.
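  // For example, the mask <0, 3, 0, 3, 1, 2, 1, 2> only needs the pairs (0,3)
  // and (1,2): a PSHUFLW of <0, 3, 1, 2> forms them and a PSHUFD of
  // <0, 0, 1, 1> then replicates them into place.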
15150   auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
15151                                ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
15152     V = DAG.getNode(ShufWOp, DL, VT, V,
15153                     getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15154     V = DAG.getBitcast(PSHUFDVT, V);
15155     V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
15156                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
15157     return DAG.getBitcast(VT, V);
15158   };
15159 
15160   if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
15161     int PSHUFDMask[4] = { -1, -1, -1, -1 };
15162     SmallVector<std::pair<int, int>, 4> DWordPairs;
15163     int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
15164 
15165     // Collect the different DWORD pairs.
15166     for (int DWord = 0; DWord != 4; ++DWord) {
15167       int M0 = Mask[2 * DWord + 0];
15168       int M1 = Mask[2 * DWord + 1];
15169       M0 = (M0 >= 0 ? M0 % 4 : M0);
15170       M1 = (M1 >= 0 ? M1 % 4 : M1);
15171       if (M0 < 0 && M1 < 0)
15172         continue;
15173 
15174       bool Match = false;
15175       for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
15176         auto &DWordPair = DWordPairs[j];
15177         if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
15178             (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
15179           DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
15180           DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
15181           PSHUFDMask[DWord] = DOffset + j;
15182           Match = true;
15183           break;
15184         }
15185       }
15186       if (!Match) {
15187         PSHUFDMask[DWord] = DOffset + DWordPairs.size();
15188         DWordPairs.push_back(std::make_pair(M0, M1));
15189       }
15190     }
15191 
15192     if (DWordPairs.size() <= 2) {
15193       DWordPairs.resize(2, std::make_pair(-1, -1));
15194       int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
15195                               DWordPairs[1].first, DWordPairs[1].second};
15196       if ((NumHToL + NumHToH) == 0)
15197         return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
15198       if ((NumLToL + NumLToH) == 0)
15199         return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
15200     }
15201   }
15202 
  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
  // such inputs we can swap two of the dwords across the half mark and end up
  // with <= 2 inputs coming from each half into each half. Once there, we can
  // fall through to the generic code below. For example:
15207   //
15208   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15209   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
15210   //
  // However, in some very rare cases we have a 1-into-3 or 3-into-1 on one
  // half and an existing 2-into-2 on the other half. In this case we may have
  // to pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
  // because any other situation (including a 3-into-1 or 1-into-3 in the half
  // other than the one we target for fixing) will be fixed when we re-enter
  // this path. We will also combine away any resulting sequence of PSHUFD
  // instructions into a single instruction. Here is an example of the tricky
  // case:
15220   //
15221   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15222   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
15223   //
15224   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
15225   //
15226   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
15227   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
15228   //
15229   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
15230   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
15231   //
15232   // The result is fine to be handled by the generic logic.
15233   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
15234                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
15235                           int AOffset, int BOffset) {
15236     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
15237            "Must call this with A having 3 or 1 inputs from the A half.");
15238     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
15239            "Must call this with B having 1 or 3 inputs from the B half.");
15240     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
15241            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
15242 
15243     bool ThreeAInputs = AToAInputs.size() == 3;
15244 
    // Compute the index of the dword with only one word among the three
    // inputs in a half by taking the sum of the half with three inputs and
    // subtracting the sum of the actual three inputs. The difference is the
    // remaining slot.
15249     int ADWord = 0, BDWord = 0;
15250     int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
15251     int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
15252     int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
15253     ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
15254     int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
15255     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
    int TripleNonInputIdx =
        TripleInputSum -
        std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
15258     TripleDWord = TripleNonInputIdx / 2;
15259 
15260     // We use xor with one to compute the adjacent DWord to whichever one the
15261     // OneInput is in.
15262     OneInputDWord = (OneInput / 2) ^ 1;
15263 
15264     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
15265     // and BToA inputs. If there is also such a problem with the BToB and AToB
15266     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
15267     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
15268     // is essential that we don't *create* a 3<-1 as then we might oscillate.
15269     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
      // Compute how many inputs will be flipped by swapping these DWords. We
      // need to balance this to ensure we don't form a 3-1 shuffle in the
      // other half.
15274       int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
15275                                  llvm::count(AToBInputs, 2 * ADWord + 1);
15276       int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
15277                                  llvm::count(BToBInputs, 2 * BDWord + 1);
15278       if ((NumFlippedAToBInputs == 1 &&
15279            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
15280           (NumFlippedBToBInputs == 1 &&
15281            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
        // We choose whether to fix the A half or B half based on whether that
        // half has zero flipped inputs. With zero flipped inputs, we may not
        // be able to fix it with that half. We also bias towards fixing the B
        // half because that will more commonly be the high half, and we have
        // to bias one way.
15286         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
15287                                                        ArrayRef<int> Inputs) {
15288           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
15289           bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
15290           // Determine whether the free index is in the flipped dword or the
15291           // unflipped dword based on where the pinned index is. We use this bit
15292           // in an xor to conditionally select the adjacent dword.
15293           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
15294           bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15295           if (IsFixIdxInput == IsFixFreeIdxInput)
15296             FixFreeIdx += 1;
15297           IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15298           assert(IsFixIdxInput != IsFixFreeIdxInput &&
15299                  "We need to be changing the number of flipped inputs!");
15300           int PSHUFHalfMask[] = {0, 1, 2, 3};
15301           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
15302           V = DAG.getNode(
15303               FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
15304               MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
15305               getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15306 
15307           for (int &M : Mask)
15308             if (M >= 0 && M == FixIdx)
15309               M = FixFreeIdx;
15310             else if (M >= 0 && M == FixFreeIdx)
15311               M = FixIdx;
15312         };
15313         if (NumFlippedBToBInputs != 0) {
15314           int BPinnedIdx =
15315               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
15316           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
15317         } else {
15318           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
15319           int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
15320           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
15321         }
15322       }
15323     }
15324 
15325     int PSHUFDMask[] = {0, 1, 2, 3};
15326     PSHUFDMask[ADWord] = BDWord;
15327     PSHUFDMask[BDWord] = ADWord;
15328     V = DAG.getBitcast(
15329         VT,
15330         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15331                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15332 
15333     // Adjust the mask to match the new locations of A and B.
15334     for (int &M : Mask)
15335       if (M >= 0 && M/2 == ADWord)
15336         M = 2 * BDWord + M % 2;
15337       else if (M >= 0 && M/2 == BDWord)
15338         M = 2 * ADWord + M % 2;
15339 
15340     // Recurse back into this routine to re-compute state now that this isn't
15341     // a 3 and 1 problem.
15342     return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
15343   };
15344   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
15345     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
15346   if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
15347     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
15348 
15349   // At this point there are at most two inputs to the low and high halves from
15350   // each half. That means the inputs can always be grouped into dwords and
15351   // those dwords can then be moved to the correct half with a dword shuffle.
15352   // We use at most one low and one high word shuffle to collect these paired
15353   // inputs into dwords, and finally a dword shuffle to place them.
15354   int PSHUFLMask[4] = {-1, -1, -1, -1};
15355   int PSHUFHMask[4] = {-1, -1, -1, -1};
15356   int PSHUFDMask[4] = {-1, -1, -1, -1};
15357 
15358   // First fix the masks for all the inputs that are staying in their
15359   // original halves. This will then dictate the targets of the cross-half
15360   // shuffles.
15361   auto fixInPlaceInputs =
15362       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
15363                     MutableArrayRef<int> SourceHalfMask,
15364                     MutableArrayRef<int> HalfMask, int HalfOffset) {
15365     if (InPlaceInputs.empty())
15366       return;
15367     if (InPlaceInputs.size() == 1) {
15368       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15369           InPlaceInputs[0] - HalfOffset;
15370       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
15371       return;
15372     }
15373     if (IncomingInputs.empty()) {
15374       // Just fix all of the in place inputs.
15375       for (int Input : InPlaceInputs) {
15376         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
15377         PSHUFDMask[Input / 2] = Input / 2;
15378       }
15379       return;
15380     }
15381 
15382     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
15383     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15384         InPlaceInputs[0] - HalfOffset;
15385     // Put the second input next to the first so that they are packed into
15386     // a dword. We find the adjacent index by toggling the low bit.
15387     int AdjIndex = InPlaceInputs[0] ^ 1;
15388     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
15389     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
15390     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
15391   };
15392   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
15393   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
15394 
15395   // Now gather the cross-half inputs and place them into a free dword of
15396   // their target half.
15397   // FIXME: This operation could almost certainly be simplified dramatically to
15398   // look more like the 3-1 fixing operation.
15399   auto moveInputsToRightHalf = [&PSHUFDMask](
15400       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
15401       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
15402       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
15403       int DestOffset) {
15404     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
15405       return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
15406     };
15407     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
15408                                                int Word) {
15409       int LowWord = Word & ~1;
15410       int HighWord = Word | 1;
15411       return isWordClobbered(SourceHalfMask, LowWord) ||
15412              isWordClobbered(SourceHalfMask, HighWord);
15413     };
15414 
15415     if (IncomingInputs.empty())
15416       return;
15417 
15418     if (ExistingInputs.empty()) {
15419       // Map any dwords with inputs from them into the right half.
15420       for (int Input : IncomingInputs) {
15421         // If the source half mask maps over the inputs, turn those into
15422         // swaps and use the swapped lane.
15423         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
15424           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
15425             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
15426                 Input - SourceOffset;
15427             // We have to swap the uses in our half mask in one sweep.
15428             for (int &M : HalfMask)
15429               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
15430                 M = Input;
15431               else if (M == Input)
15432                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15433           } else {
15434             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
15435                        Input - SourceOffset &&
15436                    "Previous placement doesn't match!");
15437           }
15438           // Note that this correctly re-maps both when we do a swap and when
15439           // we observe the other side of the swap above. We rely on that to
15440           // avoid swapping the members of the input list directly.
15441           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15442         }
15443 
15444         // Map the input's dword into the correct half.
15445         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
15446           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
15447         else
15448           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
15449                      Input / 2 &&
15450                  "Previous placement doesn't match!");
15451       }
15452 
15453       // And just directly shift any other-half mask elements to be same-half
15454       // as we will have mirrored the dword containing the element into the
15455       // same position within that half.
15456       for (int &M : HalfMask)
15457         if (M >= SourceOffset && M < SourceOffset + 4) {
15458           M = M - SourceOffset + DestOffset;
15459           assert(M >= 0 && "This should never wrap below zero!");
15460         }
15461       return;
15462     }
15463 
15464     // Ensure we have the input in a viable dword of its current half. This
15465     // is particularly tricky because the original position may be clobbered
15466     // by inputs being moved and *staying* in that half.
15467     if (IncomingInputs.size() == 1) {
15468       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
15469         int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
15470                          SourceOffset;
15471         SourceHalfMask[InputFixed - SourceOffset] =
15472             IncomingInputs[0] - SourceOffset;
15473         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
15474                      InputFixed);
15475         IncomingInputs[0] = InputFixed;
15476       }
15477     } else if (IncomingInputs.size() == 2) {
15478       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
15479           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
15480         // We have two non-adjacent or clobbered inputs we need to extract from
15481         // the source half. To do this, we need to map them into some adjacent
15482         // dword slot in the source mask.
15483         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
15484                               IncomingInputs[1] - SourceOffset};
15485 
15486         // If there is a free slot in the source half mask adjacent to one of
15487         // the inputs, place the other input in it. We use (Index XOR 1) to
15488         // compute an adjacent index.
15489         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
15490             SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
15491           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
15492           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15493           InputsFixed[1] = InputsFixed[0] ^ 1;
15494         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
15495                    SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
15496           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
15497           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
15498           InputsFixed[0] = InputsFixed[1] ^ 1;
15499         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
15500                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
15501           // The two inputs are in the same DWord but it is clobbered and the
15502           // adjacent DWord isn't used at all. Move both inputs to the free
15503           // slot.
15504           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
15505           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
15506           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
15507           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
15508         } else {
15509           // The only way we hit this point is if there is no clobbering
15510           // (because there are no off-half inputs to this half) and there is no
15511           // free slot adjacent to one of the inputs. In this case, we have to
15512           // swap an input with a non-input.
15513           for (int i = 0; i < 4; ++i)
15514             assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
15515                    "We can't handle any clobbers here!");
15516           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
15517                  "Cannot have adjacent inputs here!");
15518 
15519           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15520           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
15521 
15522           // We also have to update the final source mask in this case because
15523           // it may need to undo the above swap.
15524           for (int &M : FinalSourceHalfMask)
15525             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
15526               M = InputsFixed[1] + SourceOffset;
15527             else if (M == InputsFixed[1] + SourceOffset)
15528               M = (InputsFixed[0] ^ 1) + SourceOffset;
15529 
15530           InputsFixed[1] = InputsFixed[0] ^ 1;
15531         }
15532 
15533         // Point everything at the fixed inputs.
15534         for (int &M : HalfMask)
15535           if (M == IncomingInputs[0])
15536             M = InputsFixed[0] + SourceOffset;
15537           else if (M == IncomingInputs[1])
15538             M = InputsFixed[1] + SourceOffset;
15539 
15540         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
15541         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
15542       }
15543     } else {
15544       llvm_unreachable("Unhandled input size!");
15545     }
15546 
15547     // Now hoist the DWord down to the right half.
15548     int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
15549     assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
15550     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
15551     for (int &M : HalfMask)
15552       for (int Input : IncomingInputs)
15553         if (M == Input)
15554           M = FreeDWord * 2 + Input % 2;
15555   };
15556   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
15557                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
15558   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
15559                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
15560 
15561   // Now enact all the shuffles we've computed to move the inputs into their
15562   // target half.
15563   if (!isNoopShuffleMask(PSHUFLMask))
15564     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15565                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
15566   if (!isNoopShuffleMask(PSHUFHMask))
15567     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15568                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
15569   if (!isNoopShuffleMask(PSHUFDMask))
15570     V = DAG.getBitcast(
15571         VT,
15572         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15573                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15574 
15575   // At this point, each half should contain all its inputs, and we can then
15576   // just shuffle them into their final position.
15577   assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
15578          "Failed to lift all the high half inputs to the low mask!");
15579   assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
15580          "Failed to lift all the low half inputs to the high mask!");
15581 
15582   // Do a half shuffle for the low mask.
15583   if (!isNoopShuffleMask(LoMask))
15584     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15585                     getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15586 
15587   // Do a half shuffle with the high mask after shifting its values down.
15588   for (int &M : HiMask)
15589     if (M >= 0)
15590       M -= 4;
15591   if (!isNoopShuffleMask(HiMask))
15592     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15593                     getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15594 
15595   return V;
15596 }
15597 
15598 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
15599 /// blend if only one input is used.
15600 static SDValue lowerShuffleAsBlendOfPSHUFBs(
15601     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15602     const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
15603   assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
15604          "Lane crossing shuffle masks not supported");
15605 
15606   int NumBytes = VT.getSizeInBits() / 8;
15607   int Size = Mask.size();
15608   int Scale = NumBytes / Size;
15609 
15610   SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15611   SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15612   V1InUse = false;
15613   V2InUse = false;
15614 
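  // Build the per-byte PSHUFB selectors. Each shuffle mask element covers
  // Scale bytes, and a selector byte of 0x80 (ZeroMask below) makes PSHUFB
  // write a zero into that byte.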
15615   for (int i = 0; i < NumBytes; ++i) {
15616     int M = Mask[i / Scale];
15617     if (M < 0)
15618       continue;
15619 
15620     const int ZeroMask = 0x80;
15621     int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
15622     int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
15623     if (Zeroable[i / Scale])
15624       V1Idx = V2Idx = ZeroMask;
15625 
15626     V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
15627     V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
15628     V1InUse |= (ZeroMask != V1Idx);
15629     V2InUse |= (ZeroMask != V2Idx);
15630   }
15631 
15632   MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
15633   if (V1InUse)
15634     V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
15635                      DAG.getBuildVector(ShufVT, DL, V1Mask));
15636   if (V2InUse)
15637     V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
15638                      DAG.getBuildVector(ShufVT, DL, V2Mask));
15639 
15640   // If we need shuffled inputs from both, blend the two.
15641   SDValue V;
15642   if (V1InUse && V2InUse)
15643     V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
15644   else
15645     V = V1InUse ? V1 : V2;
15646 
15647   // Cast the result back to the correct type.
15648   return DAG.getBitcast(VT, V);
15649 }
15650 
15651 /// Generic lowering of 8-lane i16 shuffles.
15652 ///
15653 /// This handles both single-input shuffles and combined shuffle/blends with
15654 /// two inputs. The single input shuffles are immediately delegated to
15655 /// a dedicated lowering routine.
15656 ///
15657 /// The blends are lowered in one of three fundamental ways. If there are few
15658 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
15659 /// of the input is significantly cheaper when lowered as an interleaving of
15660 /// the two inputs, try to interleave them. Otherwise, blend the low and high
15661 /// halves of the inputs separately (making them have relatively few inputs)
15662 /// and then concatenate them.
15663 static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15664                                  const APInt &Zeroable, SDValue V1, SDValue V2,
15665                                  const X86Subtarget &Subtarget,
15666                                  SelectionDAG &DAG) {
15667   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15668   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15669   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15670 
15671   // Whenever we can lower this as a zext, that instruction is strictly faster
15672   // than any alternative.
15673   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
15674                                                    Zeroable, Subtarget, DAG))
15675     return ZExt;
15676 
  // Try to lower using a truncation.
15678   if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15679                                         Subtarget, DAG))
15680     return V;
15681 
15682   int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
15683 
15684   if (NumV2Inputs == 0) {
15685     // Try to use shift instructions.
15686     if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
15687                                             Zeroable, Subtarget, DAG))
15688       return Shift;
15689 
15690     // Check for being able to broadcast a single element.
15691     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
15692                                                     Mask, Subtarget, DAG))
15693       return Broadcast;
15694 
15695     // Try to use bit rotation instructions.
15696     if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
15697                                                  Subtarget, DAG))
15698       return Rotate;
15699 
15700     // Use dedicated unpack instructions for masks that match their pattern.
15701     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15702       return V;
15703 
15704     // Use dedicated pack instructions for masks that match their pattern.
15705     if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15706                                          Subtarget))
15707       return V;
15708 
15709     // Try to use byte rotation instructions.
15710     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
15711                                                   Subtarget, DAG))
15712       return Rotate;
15713 
15714     // Make a copy of the mask so it can be modified.
15715     SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
15716     return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
15717                                                Subtarget, DAG);
15718   }
15719 
15720   assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
15721          "All single-input shuffles should be canonicalized to be V1-input "
15722          "shuffles.");
15723 
15724   // Try to use shift instructions.
15725   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
15726                                           Zeroable, Subtarget, DAG))
15727     return Shift;
15728 
15729   // See if we can use SSE4A Extraction / Insertion.
15730   if (Subtarget.hasSSE4A())
15731     if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
15732                                           Zeroable, DAG))
15733       return V;
15734 
15735   // There are special ways we can lower some single-element blends.
15736   if (NumV2Inputs == 1)
15737     if (SDValue V = lowerShuffleAsElementInsertion(
15738             DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15739       return V;
15740 
15741   // We have different paths for blend lowering, but they all must use the
15742   // *exact* same predicate.
15743   bool IsBlendSupported = Subtarget.hasSSE41();
15744   if (IsBlendSupported)
15745     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15746                                             Zeroable, Subtarget, DAG))
15747       return Blend;
15748 
15749   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15750                                              Zeroable, Subtarget, DAG))
15751     return Masked;
15752 
15753   // Use dedicated unpack instructions for masks that match their pattern.
15754   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15755     return V;
15756 
15757   // Use dedicated pack instructions for masks that match their pattern.
15758   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15759                                        Subtarget))
15760     return V;
15761 
  // Try to lower using a truncation.
15763   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15764                                        Subtarget, DAG))
15765     return V;
15766 
15767   // Try to use byte rotation instructions.
15768   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15769                                                 Subtarget, DAG))
15770     return Rotate;
15771 
15772   if (SDValue BitBlend =
15773           lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15774     return BitBlend;
15775 
15776   // Try to use byte shift instructions to mask.
15777   if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15778                                               Zeroable, Subtarget, DAG))
15779     return V;
15780 
  // Attempt to lower using compaction. SSE41 is necessary for PACKUSDW.
15782   // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
15783   // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
15784   int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
15785   if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
15786       !Subtarget.hasVLX()) {
15787     // Check if this is part of a 256-bit vector truncation.
15788     if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
15789         peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
15790         peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
15791       SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
15792       V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
15793                          getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
15794                          DAG.getTargetConstant(0xEE, DL, MVT::i8));
15795       V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
15796       V1 = extract128BitVector(V1V2, 0, DAG, DL);
15797       V2 = extract128BitVector(V1V2, 4, DAG, DL);
15798     } else {
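      // AND each dword with 0xFFFF in the lanes we want to keep and 0
      // elsewhere; every surviving value then fits in 16 bits, so the
      // unsigned-saturating PACKUS below passes it through unchanged.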
15799       SmallVector<SDValue, 4> DWordClearOps(4,
15800                                             DAG.getConstant(0, DL, MVT::i32));
15801       for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15802         DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15803       SDValue DWordClearMask =
15804           DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
15805       V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15806                        DWordClearMask);
15807       V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15808                        DWordClearMask);
15809     }
15810     // Now pack things back together.
15811     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
15812     if (NumEvenDrops == 2) {
15813       Result = DAG.getBitcast(MVT::v4i32, Result);
15814       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
15815     }
15816     return Result;
15817   }
15818 
15819   // When compacting odd (upper) elements, use PACKSS pre-SSE41.
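  // PACKSSDW saturates signed values, so the arithmetic shift sign-extends the
  // kept word and lets it survive the pack losslessly; with SSE41 a logical
  // shift plus PACKUSDW works instead.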
15820   int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
15821   if (NumOddDrops == 1) {
15822     bool HasSSE41 = Subtarget.hasSSE41();
15823     V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
15824                      DAG.getBitcast(MVT::v4i32, V1),
15825                      DAG.getTargetConstant(16, DL, MVT::i8));
15826     V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
15827                      DAG.getBitcast(MVT::v4i32, V2),
15828                      DAG.getTargetConstant(16, DL, MVT::i8));
15829     return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
15830                        MVT::v8i16, V1, V2);
15831   }
15832 
15833   // Try to lower by permuting the inputs into an unpack instruction.
15834   if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15835                                                       Mask, Subtarget, DAG))
15836     return Unpack;
15837 
15838   // If we can't directly blend but can use PSHUFB, that will be better as it
15839   // can both shuffle and set up the inefficient blend.
15840   if (!IsBlendSupported && Subtarget.hasSSSE3()) {
15841     bool V1InUse, V2InUse;
15842     return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15843                                         Zeroable, DAG, V1InUse, V2InUse);
15844   }
15845 
  // We can always bit-blend if we have to, so the fallback strategy is to
15847   // decompose into single-input permutes and blends/unpacks.
15848   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
15849                                               Mask, Subtarget, DAG);
15850 }
15851 
15852 /// Lower 8-lane 16-bit floating point shuffles.
15853 static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15854                                  const APInt &Zeroable, SDValue V1, SDValue V2,
15855                                  const X86Subtarget &Subtarget,
15856                                  SelectionDAG &DAG) {
15857   assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
15858   assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
15859   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15860   int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
15861 
15862   if (NumV2Elements == 0) {
15863     // Check for being able to broadcast a single element.
15864     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
15865                                                     Mask, Subtarget, DAG))
15866       return Broadcast;
15867   }
15868   if (NumV2Elements == 1 && Mask[0] >= 8)
15869     if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v8f16, V1, V2, Mask,
15870                                                    Zeroable, Subtarget, DAG))
15871       return V;
15872 
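  // Otherwise, lower as a v8i16 integer shuffle and bitcast back to v8f16.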
15873   V1 = DAG.getBitcast(MVT::v8i16, V1);
15874   V2 = DAG.getBitcast(MVT::v8i16, V2);
15875   return DAG.getBitcast(MVT::v8f16,
15876                         DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15877 }
15878 
// Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
// sub-512-bit shuffles are widened to 512 bits for the shuffle and then
// the active subvector is extracted.
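// For example, a v8i32 shuffle on an AVX512 target without VLX is widened to
// v16i32: second-input indices (8..15) are rebased to 16..23, the shuffle is
// performed at 512 bits, and the low 256 bits of the result are extracted.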
15882 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
15883                                      ArrayRef<int> Mask, SDValue V1, SDValue V2,
15884                                      const X86Subtarget &Subtarget,
15885                                      SelectionDAG &DAG) {
15886   MVT MaskVT = VT.changeTypeToInteger();
15887   SDValue MaskNode;
15888   MVT ShuffleVT = VT;
15889   if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
15890     V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
15891     V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
15892     ShuffleVT = V1.getSimpleValueType();
15893 
15894     // Adjust mask to correct indices for the second input.
15895     int NumElts = VT.getVectorNumElements();
15896     unsigned Scale = 512 / VT.getSizeInBits();
15897     SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
15898     for (int &M : AdjustedMask)
15899       if (NumElts <= M)
15900         M += (Scale - 1) * NumElts;
15901     MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
15902     MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
15903   } else {
15904     MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
15905   }
15906 
15907   SDValue Result;
15908   if (V2.isUndef())
15909     Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
15910   else
15911     Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
15912 
15913   if (VT != ShuffleVT)
15914     Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
15915 
15916   return Result;
15917 }
15918 
15919 /// Generic lowering of v16i8 shuffles.
15920 ///
15921 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
15922 /// detect any complexity reducing interleaving. If that doesn't help, it uses
15923 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15924 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15925 /// back together.
15926 static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15927                                  const APInt &Zeroable, SDValue V1, SDValue V2,
15928                                  const X86Subtarget &Subtarget,
15929                                  SelectionDAG &DAG) {
15930   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15931   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15932   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15933 
15934   // Try to use shift instructions.
15935   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
15936                                           Zeroable, Subtarget, DAG))
15937     return Shift;
15938 
15939   // Try to use byte rotation instructions.
15940   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
15941                                                 Subtarget, DAG))
15942     return Rotate;
15943 
15944   // Use dedicated pack instructions for masks that match their pattern.
15945   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
15946                                        Subtarget))
15947     return V;
15948 
15949   // Try to use a zext lowering.
15950   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
15951                                                    Zeroable, Subtarget, DAG))
15952     return ZExt;
15953 
  // Try to lower using a truncation.
15955   if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15956                                         Subtarget, DAG))
15957     return V;
15958 
15959   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15960                                        Subtarget, DAG))
15961     return V;
15962 
15963   // See if we can use SSE4A Extraction / Insertion.
15964   if (Subtarget.hasSSE4A())
15965     if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
15966                                           Zeroable, DAG))
15967       return V;
15968 
15969   int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
15970 
15971   // For single-input shuffles, there are some nicer lowering tricks we can use.
15972   if (NumV2Elements == 0) {
15973     // Check for being able to broadcast a single element.
15974     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
15975                                                     Mask, Subtarget, DAG))
15976       return Broadcast;
15977 
15978     // Try to use bit rotation instructions.
15979     if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
15980                                                  Subtarget, DAG))
15981       return Rotate;
15982 
15983     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15984       return V;
15985 
15986     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
15987     // Notably, this handles splat and partial-splat shuffles more efficiently.
15988     // However, it only makes sense if the pre-duplication shuffle simplifies
15989     // things significantly. Currently, this means we need to be able to
15990     // express the pre-duplication shuffle as an i16 shuffle.
15991     //
15992     // FIXME: We should check for other patterns which can be widened into an
15993     // i16 shuffle as well.
15994     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
15995       for (int i = 0; i < 16; i += 2)
15996         if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15997           return false;
15998 
15999       return true;
16000     };
16001     auto tryToWidenViaDuplication = [&]() -> SDValue {
16002       if (!canWidenViaDuplication(Mask))
16003         return SDValue();
16004       SmallVector<int, 4> LoInputs;
16005       copy_if(Mask, std::back_inserter(LoInputs),
16006               [](int M) { return M >= 0 && M < 8; });
16007       array_pod_sort(LoInputs.begin(), LoInputs.end());
16008       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
16009                      LoInputs.end());
16010       SmallVector<int, 4> HiInputs;
16011       copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
16012       array_pod_sort(HiInputs.begin(), HiInputs.end());
16013       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
16014                      HiInputs.end());
16015 
16016       bool TargetLo = LoInputs.size() >= HiInputs.size();
16017       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
16018       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
16019 
16020       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
16021       SmallDenseMap<int, int, 8> LaneMap;
16022       for (int I : InPlaceInputs) {
16023         PreDupI16Shuffle[I/2] = I/2;
16024         LaneMap[I] = I;
16025       }
16026       int j = TargetLo ? 0 : 4, je = j + 4;
16027       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
16028         // Check if j is already a shuffle of this input. This happens when
16029         // there are two adjacent bytes after we move the low one.
16030         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
16031           // If we haven't yet mapped the input, search for a slot into which
16032           // we can map it.
16033           while (j < je && PreDupI16Shuffle[j] >= 0)
16034             ++j;
16035 
16036           if (j == je)
            // We can't place the inputs into a single half with a simple i16
            // shuffle, so bail.
16038             return SDValue();
16039 
16040           // Map this input with the i16 shuffle.
16041           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
16042         }
16043 
16044         // Update the lane map based on the mapping we ended up with.
16045         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
16046       }
16047       V1 = DAG.getBitcast(
16048           MVT::v16i8,
16049           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16050                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
16051 
16052       // Unpack the bytes to form the i16s that will be shuffled into place.
16053       bool EvenInUse = false, OddInUse = false;
16054       for (int i = 0; i < 16; i += 2) {
16055         EvenInUse |= (Mask[i + 0] >= 0);
16056         OddInUse |= (Mask[i + 1] >= 0);
16057         if (EvenInUse && OddInUse)
16058           break;
16059       }
16060       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
16061                        MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
16062                        OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
16063 
16064       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
16065       for (int i = 0; i < 16; ++i)
16066         if (Mask[i] >= 0) {
16067           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
16068           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
16069           if (PostDupI16Shuffle[i / 2] < 0)
16070             PostDupI16Shuffle[i / 2] = MappedMask;
16071           else
16072             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
16073                    "Conflicting entries in the original shuffle!");
16074         }
16075       return DAG.getBitcast(
16076           MVT::v16i8,
16077           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16078                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
16079     };
16080     if (SDValue V = tryToWidenViaDuplication())
16081       return V;
16082   }
16083 
16084   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
16085                                              Zeroable, Subtarget, DAG))
16086     return Masked;
16087 
16088   // Use dedicated unpack instructions for masks that match their pattern.
16089   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16090     return V;
16091 
16092   // Try to use byte shift instructions to mask.
16093   if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
16094                                               Zeroable, Subtarget, DAG))
16095     return V;
16096 
16097   // Check for compaction patterns.
16098   bool IsSingleInput = V2.isUndef();
16099   int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
16100 
16101   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
16102   // with PSHUFB. It is important to do this before we attempt to generate any
16103   // blends but after all of the single-input lowerings. If the single input
16104   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
16105   // want to preserve that and we can DAG combine any longer sequences into
16106   // a PSHUFB in the end. But once we start blending from multiple inputs,
16107   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
16108   // and there are *very* few patterns that would actually be faster than the
16109   // PSHUFB approach because of its ability to zero lanes.
16110   //
16111   // If the mask is a binary compaction, we can more efficiently perform this
16112   // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
16113   //
16114   // FIXME: The only exceptions to the above are blends which are exact
16115   // interleavings with direct instructions supporting them. We currently don't
16116   // handle those well here.
16117   if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
16118     bool V1InUse = false;
16119     bool V2InUse = false;
16120 
16121     SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
16122         DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
16123 
16124     // If both V1 and V2 are in use and we can use a direct blend or an unpack,
16125     // do so. This avoids using them to handle blends-with-zero which is
16126     // important as a single pshufb is significantly faster for that.
16127     if (V1InUse && V2InUse) {
16128       if (Subtarget.hasSSE41())
16129         if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
16130                                                 Zeroable, Subtarget, DAG))
16131           return Blend;
16132 
16133       // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very slightly) more efficient, we
      // prefer this lowering because there are common cases where part of
16136       // the complexity of the shuffles goes away when we do the final blend as
16137       // an unpack.
16138       // FIXME: It might be worth trying to detect if the unpack-feeding
16139       // shuffles will both be pshufb, in which case we shouldn't bother with
16140       // this.
16141       if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
16142               DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16143         return Unpack;
16144 
16145       // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16146       if (Subtarget.hasVBMI())
16147         return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
16148                                      DAG);
16149 
16150       // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
16151       if (Subtarget.hasXOP()) {
16152         SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
16153         return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
16154       }
16155 
16156       // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16157       // PALIGNR will be cheaper than the second PSHUFB+OR.
16158       if (SDValue V = lowerShuffleAsByteRotateAndPermute(
16159               DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16160         return V;
16161     }
16162 
16163     return PSHUFB;
16164   }
16165 
16166   // There are special ways we can lower some single-element blends.
16167   if (NumV2Elements == 1)
16168     if (SDValue V = lowerShuffleAsElementInsertion(
16169             DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16170       return V;
16171 
16172   if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
16173     return Blend;
16174 
16175   // Check whether a compaction lowering can be done. This handles shuffles
16176   // which take every Nth element for some even N. See the helper function for
16177   // details.
16178   //
16179   // We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
16181   // rearranging bytes to truncate wide elements.
16182   if (NumEvenDrops) {
    // NumEvenDrops is the log2 of the stride of the elements we keep. Another
    // way of thinking about it is that we need to drop the even elements this
    // many times to get the original input.
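    // For example, the two-input mask <0, 2, 4, ..., 30> keeps every 2nd byte
    // (NumEvenDrops == 1): clear the upper 8 bits of each word in both inputs
    // and PACKUSWB them back together.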
16186 
16187     // First we need to zero all the dropped bytes.
16188     assert(NumEvenDrops <= 3 &&
16189            "No support for dropping even elements more than 3 times.");
16190     SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
16191     for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
16192       WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
16193     SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
16194     V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
16195                      WordClearMask);
16196     if (!IsSingleInput)
16197       V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
16198                        WordClearMask);
16199 
16200     // Now pack things back together.
16201     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16202                                  IsSingleInput ? V1 : V2);
16203     for (int i = 1; i < NumEvenDrops; ++i) {
16204       Result = DAG.getBitcast(MVT::v8i16, Result);
16205       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
16206     }
16207     return Result;
16208   }
16209 
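  // Likewise compact odd (upper) bytes: shift each word right by 8 so the
  // kept byte lands in the low half, then PACKUSWB the results back together.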
16210   int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
16211   if (NumOddDrops == 1) {
16212     V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16213                      DAG.getBitcast(MVT::v8i16, V1),
16214                      DAG.getTargetConstant(8, DL, MVT::i8));
16215     if (!IsSingleInput)
16216       V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16217                        DAG.getBitcast(MVT::v8i16, V2),
16218                        DAG.getTargetConstant(8, DL, MVT::i8));
16219     return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16220                        IsSingleInput ? V1 : V2);
16221   }
16222 
16223   // Handle multi-input cases by blending/unpacking single-input shuffles.
16224   if (NumV2Elements > 0)
16225     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
16226                                                 Subtarget, DAG);
16227 
16228   // The fallback path for single-input shuffles widens this into two v8i16
16229   // vectors with unpacks, shuffles those, and then pulls them back together
16230   // with a pack.
16231   SDValue V = V1;
16232 
16233   std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16234   std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16235   for (int i = 0; i < 16; ++i)
16236     if (Mask[i] >= 0)
16237       (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
16238 
16239   SDValue VLoHalf, VHiHalf;
16240   // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
16241   // them out and avoid using UNPCK{L,H} to extract the elements of V as
16242   // i16s.
16243   if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
16244       none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
16245     // Use a mask to drop the high bytes.
16246     VLoHalf = DAG.getBitcast(MVT::v8i16, V);
16247     VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
16248                           DAG.getConstant(0x00FF, DL, MVT::v8i16));
16249 
16250     // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
16251     VHiHalf = DAG.getUNDEF(MVT::v8i16);
16252 
16253     // Squash the masks to point directly into VLoHalf.
16254     for (int &M : LoBlendMask)
16255       if (M >= 0)
16256         M /= 2;
16257     for (int &M : HiBlendMask)
16258       if (M >= 0)
16259         M /= 2;
16260   } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half
    // into VHiHalf so that we can blend them as i16s.
16263     SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
16264 
16265     VLoHalf = DAG.getBitcast(
16266         MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
16267     VHiHalf = DAG.getBitcast(
16268         MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
16269   }
16270 
  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
                                     LoBlendMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
                                     HiBlendMask);
16273 
16274   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
16275 }
16276 
16277 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
16278 ///
16279 /// This routine breaks down the specific type of 128-bit shuffle and
16280 /// dispatches to the lowering routines accordingly.
16281 static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16282                                   MVT VT, SDValue V1, SDValue V2,
16283                                   const APInt &Zeroable,
16284                                   const X86Subtarget &Subtarget,
16285                                   SelectionDAG &DAG) {
16286   switch (VT.SimpleTy) {
16287   case MVT::v2i64:
16288     return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16289   case MVT::v2f64:
16290     return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16291   case MVT::v4i32:
16292     return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16293   case MVT::v4f32:
16294     return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16295   case MVT::v8i16:
16296     return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16297   case MVT::v8f16:
16298     return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16299   case MVT::v16i8:
16300     return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16301 
16302   default:
16303     llvm_unreachable("Unimplemented!");
16304   }
16305 }
16306 
16307 /// Generic routine to split vector shuffle into half-sized shuffles.
16308 ///
16309 /// This routine just extracts two subvectors, shuffles them independently, and
16310 /// then concatenates them back together. This should work effectively with all
16311 /// AVX vector shuffle types.
16312 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
16313                                     SDValue V2, ArrayRef<int> Mask,
16314                                     SelectionDAG &DAG) {
16315   assert(VT.getSizeInBits() >= 256 &&
16316          "Only for 256-bit or wider vector shuffles!");
16317   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
16318   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
16319 
16320   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
16321   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
16322 
16323   int NumElements = VT.getVectorNumElements();
16324   int SplitNumElements = NumElements / 2;
16325   MVT ScalarVT = VT.getVectorElementType();
16326   MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
16327 
16328   // Use splitVector/extractSubVector so that split build-vectors just build two
16329   // narrower build vectors. This helps shuffling with splats and zeros.
16330   auto SplitVector = [&](SDValue V) {
16331     SDValue LoV, HiV;
16332     std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
16333     return std::make_pair(DAG.getBitcast(SplitVT, LoV),
16334                           DAG.getBitcast(SplitVT, HiV));
16335   };
16336 
16337   SDValue LoV1, HiV1, LoV2, HiV2;
16338   std::tie(LoV1, HiV1) = SplitVector(V1);
16339   std::tie(LoV2, HiV2) = SplitVector(V2);
16340 
16341   // Now create two 4-way blends of these half-width vectors.
16342   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
16343     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
16344     SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
16345     SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
16346     SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
16347     for (int i = 0; i < SplitNumElements; ++i) {
16348       int M = HalfMask[i];
16349       if (M >= NumElements) {
16350         if (M >= NumElements + SplitNumElements)
16351           UseHiV2 = true;
16352         else
16353           UseLoV2 = true;
16354         V2BlendMask[i] = M - NumElements;
16355         BlendMask[i] = SplitNumElements + i;
16356       } else if (M >= 0) {
16357         if (M >= SplitNumElements)
16358           UseHiV1 = true;
16359         else
16360           UseLoV1 = true;
16361         V1BlendMask[i] = M;
16362         BlendMask[i] = i;
16363       }
16364     }
16365 
16366     // Because the lowering happens after all combining takes place, we need to
16367     // manually combine these blend masks as much as possible so that we create
16368     // a minimal number of high-level vector shuffle nodes.
16369 
16370     // First try just blending the halves of V1 or V2.
16371     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
16372       return DAG.getUNDEF(SplitVT);
16373     if (!UseLoV2 && !UseHiV2)
16374       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16375     if (!UseLoV1 && !UseHiV1)
16376       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16377 
16378     SDValue V1Blend, V2Blend;
16379     if (UseLoV1 && UseHiV1) {
16380       V1Blend =
16381         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16382     } else {
16383       // We only use half of V1 so map the usage down into the final blend mask.
16384       V1Blend = UseLoV1 ? LoV1 : HiV1;
16385       for (int i = 0; i < SplitNumElements; ++i)
16386         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
16387           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
16388     }
16389     if (UseLoV2 && UseHiV2) {
16390       V2Blend =
16391         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16392     } else {
16393       // We only use half of V2 so map the usage down into the final blend mask.
16394       V2Blend = UseLoV2 ? LoV2 : HiV2;
16395       for (int i = 0; i < SplitNumElements; ++i)
16396         if (BlendMask[i] >= SplitNumElements)
16397           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
16398     }
16399     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
16400   };
16401   SDValue Lo = HalfBlend(LoMask);
16402   SDValue Hi = HalfBlend(HiMask);
16403   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16404 }
16405 
16406 /// Either split a vector in halves or decompose the shuffles and the
16407 /// blend/unpack.
16408 ///
16409 /// This is provided as a good fallback for many lowerings of non-single-input
16410 /// shuffles with more than one 128-bit lane. In those cases, we want to select
16411 /// between splitting the shuffle into 128-bit components and stitching those
16412 /// back together vs. extracting the single-input shuffles and blending those
16413 /// results.
16414 static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
16415                                           SDValue V2, ArrayRef<int> Mask,
16416                                           const X86Subtarget &Subtarget,
16417                                           SelectionDAG &DAG) {
16418   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
16419          "shuffles as it could then recurse on itself.");
16420   int Size = Mask.size();
16421 
16422   // If this can be modeled as a broadcast of two elements followed by a blend,
16423   // prefer that lowering. This is especially important because broadcasts can
16424   // often fold with memory operands.
16425   auto DoBothBroadcast = [&] {
16426     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
16427     for (int M : Mask)
16428       if (M >= Size) {
16429         if (V2BroadcastIdx < 0)
16430           V2BroadcastIdx = M - Size;
16431         else if (M - Size != V2BroadcastIdx)
16432           return false;
16433       } else if (M >= 0) {
16434         if (V1BroadcastIdx < 0)
16435           V1BroadcastIdx = M;
16436         else if (M != V1BroadcastIdx)
16437           return false;
16438       }
16439     return true;
16440   };
16441   if (DoBothBroadcast())
16442     return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
16443                                                 DAG);
16444 
16445   // If the inputs all stem from a single 128-bit lane of each input, then we
16446   // split them rather than blending because the split will decompose to
16447   // unusually few instructions.
16448   int LaneCount = VT.getSizeInBits() / 128;
16449   int LaneSize = Size / LaneCount;
16450   SmallBitVector LaneInputs[2];
16451   LaneInputs[0].resize(LaneCount, false);
16452   LaneInputs[1].resize(LaneCount, false);
16453   for (int i = 0; i < Size; ++i)
16454     if (Mask[i] >= 0)
16455       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
16456   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
16457     return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16458 
16459   // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
16460   // requires that the decomposed single-input shuffles don't end up here.
16461   return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
16462                                               DAG);
16463 }
16464 
16465 // Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16466 // TODO: Extend to support v8f32 (+ 512-bit shuffles).
16467 static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
16468                                                  SDValue V1, SDValue V2,
16469                                                  ArrayRef<int> Mask,
16470                                                  SelectionDAG &DAG) {
16471   assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
16472 
16473   int LHSMask[4] = {-1, -1, -1, -1};
16474   int RHSMask[4] = {-1, -1, -1, -1};
16475   unsigned SHUFPMask = 0;
16476 
16477   // As SHUFPD uses a single LHS/RHS element per lane, we can always
16478   // perform the shuffle once the lanes have been shuffled in place.
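  // For example, the v4f64 mask <1, 5, 2, 6> yields LHSMask = <u, 1, 2, u>,
  // RHSMask = <u, 5, 6, u> and SHUFPMask = 0b0011, i.e. SHUFPD(LHS, RHS, 0x3).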
16479   for (int i = 0; i != 4; ++i) {
16480     int M = Mask[i];
16481     if (M < 0)
16482       continue;
16483     int LaneBase = i & ~1;
16484     auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
16485     LaneMask[LaneBase + (M & 1)] = M;
16486     SHUFPMask |= (M & 1) << i;
16487   }
16488 
16489   SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
16490   SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
16491   return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
16492                      DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
16493 }
16494 
16495 /// Lower a vector shuffle crossing multiple 128-bit lanes as
16496 /// a lane permutation followed by a per-lane permutation.
16497 ///
16498 /// This is mainly for cases where we can have non-repeating permutes
16499 /// in each lane.
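///
/// For example, the single-input v8f32 mask <7, 6, 5, 4, 1, 0, 3, 2> is
/// lowered as a 128-bit lane swap (<4, 5, 6, 7, 0, 1, 2, 3>) followed by the
/// in-lane permute <3, 2, 1, 0, 5, 4, 7, 6>.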
16500 ///
16501 /// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
16502 /// we should investigate merging them.
16503 static SDValue lowerShuffleAsLanePermuteAndPermute(
16504     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16505     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16506   int NumElts = VT.getVectorNumElements();
16507   int NumLanes = VT.getSizeInBits() / 128;
16508   int NumEltsPerLane = NumElts / NumLanes;
16509   bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
16510 
16511   /// Attempts to find a sublane permute with the given size
16512   /// that gets all elements into their target lanes.
16513   ///
  /// If successful, returns the lowered shuffle: a cross-lane permute that
  /// moves the elements into their target lanes followed by an in-lane
  /// permute. If unsuccessful, returns an empty SDValue.
16516   auto getSublanePermute = [&](int NumSublanes) -> SDValue {
16517     int NumSublanesPerLane = NumSublanes / NumLanes;
16518     int NumEltsPerSublane = NumElts / NumSublanes;
16519 
16520     SmallVector<int, 16> CrossLaneMask;
16521     SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
16522     // CrossLaneMask but one entry == one sublane.
16523     SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
16524 
16525     for (int i = 0; i != NumElts; ++i) {
16526       int M = Mask[i];
16527       if (M < 0)
16528         continue;
16529 
16530       int SrcSublane = M / NumEltsPerSublane;
16531       int DstLane = i / NumEltsPerLane;
16532 
16533       // We only need to get the elements into the right lane, not sublane.
16534       // So search all sublanes that make up the destination lane.
16535       bool Found = false;
16536       int DstSubStart = DstLane * NumSublanesPerLane;
16537       int DstSubEnd = DstSubStart + NumSublanesPerLane;
16538       for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
16539         if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
16540           continue;
16541 
16542         Found = true;
16543         CrossLaneMaskLarge[DstSublane] = SrcSublane;
16544         int DstSublaneOffset = DstSublane * NumEltsPerSublane;
16545         InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
16546         break;
16547       }
16548       if (!Found)
16549         return SDValue();
16550     }
16551 
16552     // Fill CrossLaneMask using CrossLaneMaskLarge.
16553     narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
16554 
16555     if (!CanUseSublanes) {
16556       // If we're only shuffling a single lowest lane and the rest are identity
16557       // then don't bother.
16558       // TODO - isShuffleMaskInputInPlace could be extended to something like
16559       // this.
16560       int NumIdentityLanes = 0;
16561       bool OnlyShuffleLowestLane = true;
16562       for (int i = 0; i != NumLanes; ++i) {
16563         int LaneOffset = i * NumEltsPerLane;
16564         if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
16565                                        i * NumEltsPerLane))
16566           NumIdentityLanes++;
16567         else if (CrossLaneMask[LaneOffset] != 0)
16568           OnlyShuffleLowestLane = false;
16569       }
16570       if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
16571         return SDValue();
16572     }
16573 
16574     SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
16575     return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
16576                                 InLaneMask);
16577   };
16578 
16579   // First attempt a solution with full lanes.
16580   if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
16581     return V;
16582 
16583   // The rest of the solutions use sublanes.
16584   if (!CanUseSublanes)
16585     return SDValue();
16586 
16587   // Then attempt a solution with 64-bit sublanes (vpermq).
16588   if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
16589     return V;
16590 
16591   // If that doesn't work and we have fast variable cross-lane shuffle,
16592   // attempt 32-bit sublanes (vpermd).
16593   if (!Subtarget.hasFastVariableCrossLaneShuffle())
16594     return SDValue();
16595 
16596   return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
16597 }
16598 
16599 /// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
16600 /// source with a lane permutation.
16601 ///
16602 /// This lowering strategy results in four instructions in the worst case for a
/// single-input cross lane shuffle, which is fewer than any other fully
/// general cross-lane shuffle strategy I'm aware of. Special cases for each
/// particular shuffle pattern should be handled prior to trying this lowering.
16606 static SDValue lowerShuffleAsLanePermuteAndShuffle(
16607     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16608     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16609   // FIXME: This should probably be generalized for 512-bit vectors as well.
16610   assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
16611   int Size = Mask.size();
16612   int LaneSize = Size / 2;
16613 
16614   // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16615   // Only do this if the elements aren't all from the lower lane,
16616   // otherwise we're (probably) better off doing a split.
16617   if (VT == MVT::v4f64 &&
16618       !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
16619     if (SDValue V =
16620             lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
16621       return V;
16622 
16623   // If there are only inputs from one 128-bit lane, splitting will in fact be
16624   // less expensive. The flags track whether the given lane contains an element
16625   // that crosses to another lane.
16626   bool AllLanes;
16627   if (!Subtarget.hasAVX2()) {
16628     bool LaneCrossing[2] = {false, false};
16629     for (int i = 0; i < Size; ++i)
16630       if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
16631         LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
16632     AllLanes = LaneCrossing[0] && LaneCrossing[1];
16633   } else {
16634     bool LaneUsed[2] = {false, false};
16635     for (int i = 0; i < Size; ++i)
16636       if (Mask[i] >= 0)
16637         LaneUsed[(Mask[i] % Size) / LaneSize] = true;
16638     AllLanes = LaneUsed[0] && LaneUsed[1];
16639   }
16640 
16641   // TODO - we could support shuffling V2 in the Flipped input.
16642   assert(V2.isUndef() &&
16643          "This last part of this routine only works on single input shuffles");
16644 
16645   SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
16646   for (int i = 0; i < Size; ++i) {
16647     int &M = InLaneMask[i];
16648     if (M < 0)
16649       continue;
16650     if (((M % Size) / LaneSize) != (i / LaneSize))
16651       M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
16652   }
16653   assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
16654          "In-lane shuffle mask expected");
16655 
  // If we aren't using both input lanes and the in-lane mask is not
  // repeating, then we're better off splitting.
16658   if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
16659     return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16660 
16661   // Flip the lanes, and shuffle the results which should now be in-lane.
16662   MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
16663   SDValue Flipped = DAG.getBitcast(PVT, V1);
16664   Flipped =
16665       DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
16666   Flipped = DAG.getBitcast(VT, Flipped);
16667   return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
16668 }
16669 
16670 /// Handle lowering 2-lane 128-bit shuffles.
16671 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
16672                                   SDValue V2, ArrayRef<int> Mask,
16673                                   const APInt &Zeroable,
16674                                   const X86Subtarget &Subtarget,
16675                                   SelectionDAG &DAG) {
16676   if (V2.isUndef()) {
16677     // Attempt to match VBROADCAST*128 subvector broadcast load.
16678     bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
16679     bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
16680     if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
16681         X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
16682       MVT MemVT = VT.getHalfNumVectorElementsVT();
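      // Byte offset of the broadcast 128-bit half within the original load.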
16683       unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
16684       auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
16685       if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
16686                                              VT, MemVT, Ld, Ofs, DAG))
16687         return BcstLd;
16688     }
16689 
16690     // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
16691     if (Subtarget.hasAVX2())
16692       return SDValue();
16693   }
16694 
16695   bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
16696 
16697   SmallVector<int, 4> WidenedMask;
16698   if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
16699     return SDValue();
16700 
16701   bool IsLowZero = (Zeroable & 0x3) == 0x3;
16702   bool IsHighZero = (Zeroable & 0xc) == 0xc;
16703 
16704   // Try to use an insert into a zero vector.
16705   if (WidenedMask[0] == 0 && IsHighZero) {
16706     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16707     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16708                               DAG.getIntPtrConstant(0, DL));
16709     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16710                        getZeroVector(VT, Subtarget, DAG, DL), LoV,
16711                        DAG.getIntPtrConstant(0, DL));
16712   }
16713 
16714   // TODO: If minimizing size and one of the inputs is a zero vector and the
  // zero vector has only one use, we could use a VPERM2X128 to save the
16716   // instruction bytes needed to explicitly generate the zero vector.
16717 
16718   // Blends are faster and handle all the non-lane-crossing cases.
16719   if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
16720                                           Subtarget, DAG))
16721     return Blend;
16722 
16723   // If either input operand is a zero vector, use VPERM2X128 because its mask
16724   // allows us to replace the zero input with an implicit zero.
16725   if (!IsLowZero && !IsHighZero) {
16726     // Check for patterns which can be matched with a single insert of a 128-bit
16727     // subvector.
16728     bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
16729     if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
16730 
16731       // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
16732       // this will likely become vinsertf128 which can't fold a 256-bit memop.
16733       if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
16734         MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16735         SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
16736                                      OnlyUsesV1 ? V1 : V2,
16737                                      DAG.getIntPtrConstant(0, DL));
16738         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16739                            DAG.getIntPtrConstant(2, DL));
16740       }
16741     }
16742 
16743     // Try to use SHUF128 if possible.
16744     if (Subtarget.hasVLX()) {
16745       if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
16746         unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
16747                             ((WidenedMask[1] % 2) << 1);
16748         return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
16749                            DAG.getTargetConstant(PermMask, DL, MVT::i8));
16750       }
16751     }
16752   }
16753 
16754   // Otherwise form a 128-bit permutation. After accounting for undefs,
16755   // convert the 64-bit shuffle mask selection values into 128-bit
16756   // selection bits by dividing the indexes by 2 and shifting into positions
16757   // defined by a vperm2*128 instruction's immediate control byte.
16758 
16759   // The immediate permute control byte looks like this:
16760   //    [1:0] - select 128 bits from sources for low half of destination
16761   //    [2]   - ignore
16762   //    [3]   - zero low half of destination
16763   //    [5:4] - select 128 bits from sources for high half of destination
16764   //    [6]   - ignore
16765   //    [7]   - zero high half of destination
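  //
  // For example, the mask <2, 3, 4, 5> widens to <1, 2> and encodes as
  // PermMask = 0x21: low half of the result from V1's upper 128 bits, high
  // half from V2's lower 128 bits.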
16766 
16767   assert((WidenedMask[0] >= 0 || IsLowZero) &&
16768          (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
16769 
16770   unsigned PermMask = 0;
16771   PermMask |= IsLowZero  ? 0x08 : (WidenedMask[0] << 0);
16772   PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
16773 
16774   // Check the immediate mask and replace unused sources with undef.
16775   if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
16776     V1 = DAG.getUNDEF(VT);
16777   if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
16778     V2 = DAG.getUNDEF(VT);
16779 
16780   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
16781                      DAG.getTargetConstant(PermMask, DL, MVT::i8));
16782 }
16783 
16784 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
16785 /// shuffling each lane.
16786 ///
16787 /// This attempts to create a repeated lane shuffle where each lane uses one
16788 /// or two of the lanes of the inputs. The lanes of the input vectors are
16789 /// shuffled in one or two independent shuffles to get the lanes into the
16790 /// position needed by the final shuffle.
16791 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
16792     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16793     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16794   assert(!V2.isUndef() && "This is only useful with multiple inputs.");
16795 
16796   if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16797     return SDValue();
16798 
16799   int NumElts = Mask.size();
16800   int NumLanes = VT.getSizeInBits() / 128;
16801   int NumLaneElts = 128 / VT.getScalarSizeInBits();
16802   SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
16803   SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
16804 
16805   // First pass will try to fill in the RepeatMask from lanes that need two
16806   // sources.
16807   for (int Lane = 0; Lane != NumLanes; ++Lane) {
16808     int Srcs[2] = {-1, -1};
16809     SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
16810     for (int i = 0; i != NumLaneElts; ++i) {
16811       int M = Mask[(Lane * NumLaneElts) + i];
16812       if (M < 0)
16813         continue;
16814       // Determine which of the possible input lanes (NumLanes from each source)
16815       // this element comes from. Assign that as one of the sources for this
      // lane. We can assign up to 2 sources for this lane. If we run out of
16817       // sources we can't do anything.
16818       int LaneSrc = M / NumLaneElts;
16819       int Src;
16820       if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
16821         Src = 0;
16822       else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
16823         Src = 1;
16824       else
16825         return SDValue();
16826 
16827       Srcs[Src] = LaneSrc;
16828       InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
16829     }
16830 
16831     // If this lane has two sources, see if it fits with the repeat mask so far.
16832     if (Srcs[1] < 0)
16833       continue;
16834 
16835     LaneSrcs[Lane][0] = Srcs[0];
16836     LaneSrcs[Lane][1] = Srcs[1];
16837 
16838     auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
16839       assert(M1.size() == M2.size() && "Unexpected mask size");
16840       for (int i = 0, e = M1.size(); i != e; ++i)
16841         if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
16842           return false;
16843       return true;
16844     };
16845 
16846     auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
16847       assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
16848       for (int i = 0, e = MergedMask.size(); i != e; ++i) {
16849         int M = Mask[i];
16850         if (M < 0)
16851           continue;
16852         assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
16853                "Unexpected mask element");
16854         MergedMask[i] = M;
16855       }
16856     };
16857 
16858     if (MatchMasks(InLaneMask, RepeatMask)) {
16859       // Merge this lane mask into the final repeat mask.
16860       MergeMasks(InLaneMask, RepeatMask);
16861       continue;
16862     }
16863 
16864     // Didn't find a match. Swap the operands and try again.
16865     std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
16866     ShuffleVectorSDNode::commuteMask(InLaneMask);
16867 
16868     if (MatchMasks(InLaneMask, RepeatMask)) {
16869       // Merge this lane mask into the final repeat mask.
16870       MergeMasks(InLaneMask, RepeatMask);
16871       continue;
16872     }
16873 
16874     // Couldn't find a match with the operands in either order.
16875     return SDValue();
16876   }
16877 
16878   // Now handle any lanes with only one source.
16879   for (int Lane = 0; Lane != NumLanes; ++Lane) {
16880     // If this lane has already been processed, skip it.
16881     if (LaneSrcs[Lane][0] >= 0)
16882       continue;
16883 
16884     for (int i = 0; i != NumLaneElts; ++i) {
16885       int M = Mask[(Lane * NumLaneElts) + i];
16886       if (M < 0)
16887         continue;
16888 
      // If RepeatMask isn't defined yet we can define it ourselves.
16890       if (RepeatMask[i] < 0)
16891         RepeatMask[i] = M % NumLaneElts;
16892 
16893       if (RepeatMask[i] < NumElts) {
16894         if (RepeatMask[i] != M % NumLaneElts)
16895           return SDValue();
16896         LaneSrcs[Lane][0] = M / NumLaneElts;
16897       } else {
16898         if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16899           return SDValue();
16900         LaneSrcs[Lane][1] = M / NumLaneElts;
16901       }
16902     }
16903 
16904     if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16905       return SDValue();
16906   }
16907 
16908   SmallVector<int, 16> NewMask(NumElts, -1);
16909   for (int Lane = 0; Lane != NumLanes; ++Lane) {
16910     int Src = LaneSrcs[Lane][0];
16911     for (int i = 0; i != NumLaneElts; ++i) {
16912       int M = -1;
16913       if (Src >= 0)
16914         M = Src * NumLaneElts + i;
16915       NewMask[Lane * NumLaneElts + i] = M;
16916     }
16917   }
16918   SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16919   // Ensure we didn't get back the shuffle we started with.
16920   // FIXME: This is a hack to make up for some splat handling code in
16921   // getVectorShuffle.
16922   if (isa<ShuffleVectorSDNode>(NewV1) &&
16923       cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
16924     return SDValue();
16925 
16926   for (int Lane = 0; Lane != NumLanes; ++Lane) {
16927     int Src = LaneSrcs[Lane][1];
16928     for (int i = 0; i != NumLaneElts; ++i) {
16929       int M = -1;
16930       if (Src >= 0)
16931         M = Src * NumLaneElts + i;
16932       NewMask[Lane * NumLaneElts + i] = M;
16933     }
16934   }
16935   SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16936   // Ensure we didn't get back the shuffle we started with.
16937   // FIXME: This is a hack to make up for some splat handling code in
16938   // getVectorShuffle.
16939   if (isa<ShuffleVectorSDNode>(NewV2) &&
16940       cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
16941     return SDValue();
16942 
16943   for (int i = 0; i != NumElts; ++i) {
16944     NewMask[i] = RepeatMask[i % NumLaneElts];
16945     if (NewMask[i] < 0)
16946       continue;
16947 
16948     NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16949   }
16950   return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
16951 }
16952 
16953 /// If the input shuffle mask results in a vector that is undefined in all upper
16954 /// or lower half elements and that mask accesses only 2 halves of the
16955 /// shuffle's operands, return true. A mask of half the width with mask indexes
16956 /// adjusted to access the extracted halves of the original shuffle operands is
/// returned in HalfMask. HalfIdx1 and HalfIdx2 identify which of the four
/// source halves (0 = lower V1, 1 = upper V1, 2 = lower V2, 3 = upper V2) each
/// operand of the half-width shuffle comes from.
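///
/// For example, the v8i32 mask <u, u, u, u, 0, 1, 8, 9> has an undef lower
/// half; HalfMask becomes <0, 1, 4, 5> with HalfIdx1 = 0 (lower half of V1)
/// and HalfIdx2 = 2 (lower half of V2).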
16959 static bool
16960 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
16961                    int &HalfIdx1, int &HalfIdx2) {
16962   assert((Mask.size() == HalfMask.size() * 2) &&
16963          "Expected input mask to be twice as long as output");
16964 
16965   // Exactly one half of the result must be undef to allow narrowing.
16966   bool UndefLower = isUndefLowerHalf(Mask);
16967   bool UndefUpper = isUndefUpperHalf(Mask);
16968   if (UndefLower == UndefUpper)
16969     return false;
16970 
16971   unsigned HalfNumElts = HalfMask.size();
16972   unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
16973   HalfIdx1 = -1;
16974   HalfIdx2 = -1;
16975   for (unsigned i = 0; i != HalfNumElts; ++i) {
16976     int M = Mask[i + MaskIndexOffset];
16977     if (M < 0) {
16978       HalfMask[i] = M;
16979       continue;
16980     }
16981 
16982     // Determine which of the 4 half vectors this element is from.
16983     // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
16984     int HalfIdx = M / HalfNumElts;
16985 
16986     // Determine the element index into its half vector source.
16987     int HalfElt = M % HalfNumElts;
16988 
16989     // We can shuffle with up to 2 half vectors, set the new 'half'
16990     // shuffle mask accordingly.
16991     if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
16992       HalfMask[i] = HalfElt;
16993       HalfIdx1 = HalfIdx;
16994       continue;
16995     }
16996     if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
16997       HalfMask[i] = HalfElt + HalfNumElts;
16998       HalfIdx2 = HalfIdx;
16999       continue;
17000     }
17001 
17002     // Too many half vectors referenced.
17003     return false;
17004   }
17005 
17006   return true;
17007 }
17008 
17009 /// Given the output values from getHalfShuffleMask(), create a half width
17010 /// shuffle of extracted vectors followed by an insert back to full width.
17011 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
17012                                      ArrayRef<int> HalfMask, int HalfIdx1,
17013                                      int HalfIdx2, bool UndefLower,
17014                                      SelectionDAG &DAG, bool UseConcat = false) {
17015   assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
17016   assert(V1.getValueType().isSimple() && "Expecting only simple types");
17017 
17018   MVT VT = V1.getSimpleValueType();
17019   MVT HalfVT = VT.getHalfNumVectorElementsVT();
17020   unsigned HalfNumElts = HalfVT.getVectorNumElements();
17021 
17022   auto getHalfVector = [&](int HalfIdx) {
17023     if (HalfIdx < 0)
17024       return DAG.getUNDEF(HalfVT);
17025     SDValue V = (HalfIdx < 2 ? V1 : V2);
17026     HalfIdx = (HalfIdx % 2) * HalfNumElts;
17027     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
17028                        DAG.getIntPtrConstant(HalfIdx, DL));
17029   };
17030 
17031   // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
17032   SDValue Half1 = getHalfVector(HalfIdx1);
17033   SDValue Half2 = getHalfVector(HalfIdx2);
17034   SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
17035   if (UseConcat) {
17036     SDValue Op0 = V;
17037     SDValue Op1 = DAG.getUNDEF(HalfVT);
17038     if (UndefLower)
17039       std::swap(Op0, Op1);
17040     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
17041   }
17042 
17043   unsigned Offset = UndefLower ? HalfNumElts : 0;
17044   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
17045                      DAG.getIntPtrConstant(Offset, DL));
17046 }
17047 
/// Lower shuffles where an entire half of a 256- or 512-bit vector is UNDEF.
17049 /// This allows for fast cases such as subvector extraction/insertion
17050 /// or shuffling smaller vector types which can lower more efficiently.
17051 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
17052                                          SDValue V2, ArrayRef<int> Mask,
17053                                          const X86Subtarget &Subtarget,
17054                                          SelectionDAG &DAG) {
17055   assert((VT.is256BitVector() || VT.is512BitVector()) &&
17056          "Expected 256-bit or 512-bit vector");
17057 
17058   bool UndefLower = isUndefLowerHalf(Mask);
17059   if (!UndefLower && !isUndefUpperHalf(Mask))
17060     return SDValue();
17061 
17062   assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
17063          "Completely undef shuffle mask should have been simplified already");
17064 
  // Upper half is undef and lower half is the whole upper subvector.
17066   // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
17067   MVT HalfVT = VT.getHalfNumVectorElementsVT();
17068   unsigned HalfNumElts = HalfVT.getVectorNumElements();
17069   if (!UndefLower &&
17070       isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
17071     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17072                              DAG.getIntPtrConstant(HalfNumElts, DL));
17073     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17074                        DAG.getIntPtrConstant(0, DL));
17075   }
17076 
  // Lower half is undef and upper half is the whole lower subvector.
17078   // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
17079   if (UndefLower &&
17080       isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
17081     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17082                              DAG.getIntPtrConstant(0, DL));
17083     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17084                        DAG.getIntPtrConstant(HalfNumElts, DL));
17085   }
17086 
17087   int HalfIdx1, HalfIdx2;
17088   SmallVector<int, 8> HalfMask(HalfNumElts);
17089   if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
17090     return SDValue();
17091 
17092   assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
17093 
17094   // Only shuffle the halves of the inputs when useful.
17095   unsigned NumLowerHalves =
17096       (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
17097   unsigned NumUpperHalves =
17098       (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
17099   assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
17100 
17101   // Determine the larger pattern of undef/halves, then decide if it's worth
17102   // splitting the shuffle based on subtarget capabilities and types.
17103   unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
17104   if (!UndefLower) {
    // XXXXuuuu: no insert is needed.
    // When only lower input halves are referenced, always extract them -
    // these are all free subreg ops.
17107     if (NumUpperHalves == 0)
17108       return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17109                                    UndefLower, DAG);
17110 
17111     if (NumUpperHalves == 1) {
17112       // AVX2 has efficient 32/64-bit element cross-lane shuffles.
17113       if (Subtarget.hasAVX2()) {
        // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
17115         if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
17116             !is128BitUnpackShuffleMask(HalfMask) &&
17117             (!isSingleSHUFPSMask(HalfMask) ||
17118              Subtarget.hasFastVariableCrossLaneShuffle()))
17119           return SDValue();
17120         // If this is a unary shuffle (assume that the 2nd operand is
17121         // canonicalized to undef), then we can use vpermpd. Otherwise, we
17122         // are better off extracting the upper half of 1 operand and using a
17123         // narrow shuffle.
17124         if (EltWidth == 64 && V2.isUndef())
17125           return SDValue();
17126       }
17127       // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17128       if (Subtarget.hasAVX512() && VT.is512BitVector())
17129         return SDValue();
17130       // Extract + narrow shuffle is better than the wide alternative.
17131       return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17132                                    UndefLower, DAG);
17133     }
17134 
    // Don't extract both uppers; instead shuffle and then extract.
17136     assert(NumUpperHalves == 2 && "Half vector count went wrong");
17137     return SDValue();
17138   }
17139 
  // UndefLower - uuuuXXXX: an insert into the high half is required if we
  // split this.
17141   if (NumUpperHalves == 0) {
17142     // AVX2 has efficient 64-bit element cross-lane shuffles.
17143     // TODO: Refine to account for unary shuffle, splat, and other masks?
17144     if (Subtarget.hasAVX2() && EltWidth == 64)
17145       return SDValue();
17146     // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17147     if (Subtarget.hasAVX512() && VT.is512BitVector())
17148       return SDValue();
17149     // Narrow shuffle + insert is better than the wide alternative.
17150     return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17151                                  UndefLower, DAG);
17152   }
17153 
17154   // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
17155   return SDValue();
17156 }
17157 
17158 /// Test whether the specified input (0 or 1) is in-place blended by the
17159 /// given mask.
17160 ///
/// This returns true if the elements from a particular input are already in
/// the slots required by the given mask and require no permutation.
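/// For example (illustrative): for the v4 mask <0,1,7,6>, input 0 is in place
/// (elements 0 and 1 already sit in slots 0 and 1) but input 1 is not, since
/// elements 7 and 6 would have to swap slots.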
17163 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
17164   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
17165   int Size = Mask.size();
17166   for (int i = 0; i < Size; ++i)
17167     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
17168       return false;
17169 
17170   return true;
17171 }
17172 
/// Handle the case where the shuffle sources come from the same 128-bit lane
/// and every lane can be represented as the same repeating mask - allowing us
/// to shuffle the sources with the repeating shuffle and then permute the
/// result to the destination lanes.
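/// For example (an illustrative v8f32 case on AVX2): the lane-crossing mask
/// <5,4,7,6,1,0,1,0> is lowered as the in-lane repeating shuffle
/// <1,0,3,2,5,4,7,6> followed by the 64-bit sub-lane permute
/// <4,5,6,7,0,1,0,1> (a VPERMPD-style lane permute of the shuffled result).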
17177 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
17178     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17179     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17180   int NumElts = VT.getVectorNumElements();
17181   int NumLanes = VT.getSizeInBits() / 128;
17182   int NumLaneElts = NumElts / NumLanes;
17183 
17184   // On AVX2 we may be able to just shuffle the lowest elements and then
17185   // broadcast the result.
17186   if (Subtarget.hasAVX2()) {
17187     for (unsigned BroadcastSize : {16, 32, 64}) {
17188       if (BroadcastSize <= VT.getScalarSizeInBits())
17189         continue;
17190       int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
17191 
      // Attempt to match a repeating pattern every NumBroadcastElts,
      // accounting for UNDEFs, that only references the lowest 128-bit
      // lane of the inputs.
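      // For example (illustrative, v8f32 with BroadcastSize == 64): the mask
      // <3,2,3,2,3,2,3,2> shuffles elements 3 and 2 into the low 64 bits and
      // then broadcasts that 64-bit element to every position.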
17195       auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
17196         for (int i = 0; i != NumElts; i += NumBroadcastElts)
17197           for (int j = 0; j != NumBroadcastElts; ++j) {
17198             int M = Mask[i + j];
17199             if (M < 0)
17200               continue;
17201             int &R = RepeatMask[j];
17202             if (0 != ((M % NumElts) / NumLaneElts))
17203               return false;
17204             if (0 <= R && R != M)
17205               return false;
17206             R = M;
17207           }
17208         return true;
17209       };
17210 
17211       SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
17212       if (!FindRepeatingBroadcastMask(RepeatMask))
17213         continue;
17214 
17215       // Shuffle the (lowest) repeated elements in place for broadcast.
17216       SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
17217 
17218       // Shuffle the actual broadcast.
17219       SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
17220       for (int i = 0; i != NumElts; i += NumBroadcastElts)
17221         for (int j = 0; j != NumBroadcastElts; ++j)
17222           BroadcastMask[i + j] = j;
17223       return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
17224                                   BroadcastMask);
17225     }
17226   }
17227 
17228   // Bail if the shuffle mask doesn't cross 128-bit lanes.
17229   if (!is128BitLaneCrossingShuffleMask(VT, Mask))
17230     return SDValue();
17231 
17232   // Bail if we already have a repeated lane shuffle mask.
17233   SmallVector<int, 8> RepeatedShuffleMask;
17234   if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
17235     return SDValue();
17236 
17237   // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
17238   // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
17239   int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
17240   int NumSubLanes = NumLanes * SubLaneScale;
17241   int NumSubLaneElts = NumLaneElts / SubLaneScale;
17242 
17243   // Check that all the sources are coming from the same lane and see if we can
17244   // form a repeating shuffle mask (local to each sub-lane). At the same time,
17245   // determine the source sub-lane for each destination sub-lane.
17246   int TopSrcSubLane = -1;
17247   SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
17248   SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
17249       SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
17250       SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
17251 
17252   for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
17253     // Extract the sub-lane mask, check that it all comes from the same lane
17254     // and normalize the mask entries to come from the first lane.
17255     int SrcLane = -1;
17256     SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
17257     for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17258       int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
17259       if (M < 0)
17260         continue;
17261       int Lane = (M % NumElts) / NumLaneElts;
17262       if ((0 <= SrcLane) && (SrcLane != Lane))
17263         return SDValue();
17264       SrcLane = Lane;
17265       int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
17266       SubLaneMask[Elt] = LocalM;
17267     }
17268 
17269     // Whole sub-lane is UNDEF.
17270     if (SrcLane < 0)
17271       continue;
17272 
17273     // Attempt to match against the candidate repeated sub-lane masks.
17274     for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
17275       auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
17276         for (int i = 0; i != NumSubLaneElts; ++i) {
17277           if (M1[i] < 0 || M2[i] < 0)
17278             continue;
17279           if (M1[i] != M2[i])
17280             return false;
17281         }
17282         return true;
17283       };
17284 
17285       auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
17286       if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
17287         continue;
17288 
17289       // Merge the sub-lane mask into the matching repeated sub-lane mask.
17290       for (int i = 0; i != NumSubLaneElts; ++i) {
17291         int M = SubLaneMask[i];
17292         if (M < 0)
17293           continue;
17294         assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
17295                "Unexpected mask element");
17296         RepeatedSubLaneMask[i] = M;
17297       }
17298 
      // Track the topmost source sub-lane - by setting the remaining to UNDEF
17300       // we can greatly simplify shuffle matching.
17301       int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
17302       TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
17303       Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
17304       break;
17305     }
17306 
17307     // Bail if we failed to find a matching repeated sub-lane mask.
17308     if (Dst2SrcSubLanes[DstSubLane] < 0)
17309       return SDValue();
17310   }
17311   assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
17312          "Unexpected source lane");
17313 
17314   // Create a repeating shuffle mask for the entire vector.
17315   SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
17316   for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
17317     int Lane = SubLane / SubLaneScale;
17318     auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
17319     for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17320       int M = RepeatedSubLaneMask[Elt];
17321       if (M < 0)
17322         continue;
17323       int Idx = (SubLane * NumSubLaneElts) + Elt;
17324       RepeatedMask[Idx] = M + (Lane * NumLaneElts);
17325     }
17326   }
17327   SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
17328 
17329   // Shuffle each source sub-lane to its destination.
17330   SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
17331   for (int i = 0; i != NumElts; i += NumSubLaneElts) {
17332     int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
17333     if (SrcSubLane < 0)
17334       continue;
17335     for (int j = 0; j != NumSubLaneElts; ++j)
17336       SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
17337   }
17338 
17339   return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
17340                               SubLaneMask);
17341 }
17342 
17343 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
17344                                    bool &ForceV1Zero, bool &ForceV2Zero,
17345                                    unsigned &ShuffleImm, ArrayRef<int> Mask,
17346                                    const APInt &Zeroable) {
17347   int NumElts = VT.getVectorNumElements();
17348   assert(VT.getScalarSizeInBits() == 64 &&
17349          (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
17350          "Unexpected data type for VSHUFPD");
17351   assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
17352          "Illegal shuffle mask");
17353 
17354   bool ZeroLane[2] = { true, true };
17355   for (int i = 0; i < NumElts; ++i)
17356     ZeroLane[i & 1] &= Zeroable[i];
17357 
17358   // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
  // Mask for V4F64: 0/1,  4/5,  2/3,  6/7..
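  // For example (illustrative, v4f64): the mask <1,5,2,6> fits the SHUFPD
  // pattern directly and gives ShuffleImm = 0b0011 (bit i selects the odd or
  // even element of the pair feeding result element i).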
17360   ShuffleImm = 0;
17361   bool ShufpdMask = true;
17362   bool CommutableMask = true;
17363   for (int i = 0; i < NumElts; ++i) {
17364     if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
17365       continue;
17366     if (Mask[i] < 0)
17367       return false;
17368     int Val = (i & 6) + NumElts * (i & 1);
17369     int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
17370     if (Mask[i] < Val || Mask[i] > Val + 1)
17371       ShufpdMask = false;
17372     if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
17373       CommutableMask = false;
17374     ShuffleImm |= (Mask[i] % 2) << i;
17375   }
17376 
17377   if (!ShufpdMask && !CommutableMask)
17378     return false;
17379 
17380   if (!ShufpdMask && CommutableMask)
17381     std::swap(V1, V2);
17382 
17383   ForceV1Zero = ZeroLane[0];
17384   ForceV2Zero = ZeroLane[1];
17385   return true;
17386 }
17387 
17388 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
17389                                       SDValue V2, ArrayRef<int> Mask,
17390                                       const APInt &Zeroable,
17391                                       const X86Subtarget &Subtarget,
17392                                       SelectionDAG &DAG) {
17393   assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
17394          "Unexpected data type for VSHUFPD");
17395 
17396   unsigned Immediate = 0;
17397   bool ForceV1Zero = false, ForceV2Zero = false;
17398   if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
17399                               Mask, Zeroable))
17400     return SDValue();
17401 
17402   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
17403   if (ForceV1Zero)
17404     V1 = getZeroVector(VT, Subtarget, DAG, DL);
17405   if (ForceV2Zero)
17406     V2 = getZeroVector(VT, Subtarget, DAG, DL);
17407 
17408   return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
17409                      DAG.getTargetConstant(Immediate, DL, MVT::i8));
17410 }
17411 
// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
// by zeroable elements in the remaining 24 elements. Turn this into two
// vmovqb instructions shuffled together.
17415 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
17416                                              SDValue V1, SDValue V2,
17417                                              ArrayRef<int> Mask,
17418                                              const APInt &Zeroable,
17419                                              SelectionDAG &DAG) {
17420   assert(VT == MVT::v32i8 && "Unexpected type!");
17421 
17422   // The first 8 indices should be every 8th element.
17423   if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
17424     return SDValue();
17425 
17426   // Remaining elements need to be zeroable.
17427   if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
17428     return SDValue();
17429 
17430   V1 = DAG.getBitcast(MVT::v4i64, V1);
17431   V2 = DAG.getBitcast(MVT::v4i64, V2);
17432 
17433   V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
17434   V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
17435 
17436   // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
17437   // the upper bits of the result using an unpckldq.
17438   SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
17439                                         { 0, 1, 2, 3, 16, 17, 18, 19,
17440                                           4, 5, 6, 7, 20, 21, 22, 23 });
17441   // Insert the unpckldq into a zero vector to widen to v32i8.
17442   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
17443                      DAG.getConstant(0, DL, MVT::v32i8), Unpack,
17444                      DAG.getIntPtrConstant(0, DL));
17445 }
17446 
17447 
17448 /// Handle lowering of 4-lane 64-bit floating point shuffles.
17449 ///
17450 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
17451 /// isn't available.
17452 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17453                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17454                                  const X86Subtarget &Subtarget,
17455                                  SelectionDAG &DAG) {
17456   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
17457   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
17458   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
17459 
17460   if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
17461                                      Subtarget, DAG))
17462     return V;
17463 
17464   if (V2.isUndef()) {
17465     // Check for being able to broadcast a single element.
17466     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
17467                                                     Mask, Subtarget, DAG))
17468       return Broadcast;
17469 
17470     // Use low duplicate instructions for masks that match their pattern.
17471     if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
17472       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
17473 
17474     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
17475       // Non-half-crossing single input shuffles can be lowered with an
17476       // interleaved permutation.
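      // For example (illustrative): the mask <1,0,2,3> yields the immediate
      // 0b1001 - element 0 takes the high element of the low 128-bit lane and
      // element 3 takes the high element of the upper lane.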
17477       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17478                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
17479       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
17480                          DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17481     }
17482 
17483     // With AVX2 we have direct support for this permutation.
17484     if (Subtarget.hasAVX2())
17485       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
17486                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17487 
17488     // Try to create an in-lane repeating shuffle mask and then shuffle the
17489     // results into the target lanes.
17490     if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17491             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17492       return V;
17493 
17494     // Try to permute the lanes and then use a per-lane permute.
17495     if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
17496                                                         Mask, DAG, Subtarget))
17497       return V;
17498 
17499     // Otherwise, fall back.
17500     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
17501                                                DAG, Subtarget);
17502   }
17503 
17504   // Use dedicated unpack instructions for masks that match their pattern.
17505   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
17506     return V;
17507 
17508   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
17509                                           Zeroable, Subtarget, DAG))
17510     return Blend;
17511 
  // Check if the blend happens to exactly fit the SHUFPD pattern.
17513   if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
17514                                           Zeroable, Subtarget, DAG))
17515     return Op;
17516 
17517   // If we have lane crossing shuffles AND they don't all come from the lower
17518   // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17519   // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
  // canonicalizes to a blend of splats, which isn't necessary for this combine.
17521   if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
17522       !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
17523       (V1.getOpcode() != ISD::BUILD_VECTOR) &&
17524       (V2.getOpcode() != ISD::BUILD_VECTOR))
17525     if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
17526                                                        Mask, DAG))
17527       return Op;
17528 
17529   // If we have one input in place, then we can permute the other input and
17530   // blend the result.
17531   if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17532     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
17533                                                 Subtarget, DAG);
17534 
17535   // Try to create an in-lane repeating shuffle mask and then shuffle the
17536   // results into the target lanes.
17537   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17538           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17539     return V;
17540 
17541   // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either input is already in place,
  // we will be able to shuffle the other input even across lanes in a single
  // instruction, so skip this pattern.
17545   if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
17546                                 isShuffleMaskInputInPlace(1, Mask))))
17547     if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
17548             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17549       return V;
17550 
17551   // If we have VLX support, we can use VEXPAND.
17552   if (Subtarget.hasVLX())
17553     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
17554                                          DAG, Subtarget))
17555       return V;
17556 
  // If we have AVX2 then we always want to lower with a blend because at v4 we
17558   // can fully permute the elements.
17559   if (Subtarget.hasAVX2())
17560     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
17561                                                 Subtarget, DAG);
17562 
17563   // Otherwise fall back on generic lowering.
17564   return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
17565                                     Subtarget, DAG);
17566 }
17567 
17568 /// Handle lowering of 4-lane 64-bit integer shuffles.
17569 ///
17570 /// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
17572 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17573                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17574                                  const X86Subtarget &Subtarget,
17575                                  SelectionDAG &DAG) {
17576   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
17577   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
17578   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
17579   assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
17580 
17581   if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
17582                                      Subtarget, DAG))
17583     return V;
17584 
17585   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
17586                                           Zeroable, Subtarget, DAG))
17587     return Blend;
17588 
17589   // Check for being able to broadcast a single element.
17590   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
17591                                                   Subtarget, DAG))
17592     return Broadcast;
17593 
17594   if (V2.isUndef()) {
    // When the shuffle is mirrored between the 128-bit lanes of the vector, we
17596     // can use lower latency instructions that will operate on both lanes.
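    // For example (illustrative): the v4i64 mask <1,0,3,2> repeats <1,0> in
    // each 128-bit lane, which widens to the v8i32 PSHUFD mask <2,3,0,1>.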
17597     SmallVector<int, 2> RepeatedMask;
17598     if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
17599       SmallVector<int, 4> PSHUFDMask;
17600       narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
17601       return DAG.getBitcast(
17602           MVT::v4i64,
17603           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
17604                       DAG.getBitcast(MVT::v8i32, V1),
17605                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17606     }
17607 
17608     // AVX2 provides a direct instruction for permuting a single input across
17609     // lanes.
17610     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
17611                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17612   }
17613 
17614   // Try to use shift instructions.
17615   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
17616                                           Zeroable, Subtarget, DAG))
17617     return Shift;
17618 
17619   // If we have VLX support, we can use VALIGN or VEXPAND.
17620   if (Subtarget.hasVLX()) {
17621     if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
17622                                               Subtarget, DAG))
17623       return Rotate;
17624 
17625     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
17626                                          DAG, Subtarget))
17627       return V;
17628   }
17629 
17630   // Try to use PALIGNR.
17631   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
17632                                                 Subtarget, DAG))
17633     return Rotate;
17634 
17635   // Use dedicated unpack instructions for masks that match their pattern.
17636   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
17637     return V;
17638 
17639   // If we have one input in place, then we can permute the other input and
17640   // blend the result.
17641   if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17642     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17643                                                 Subtarget, DAG);
17644 
17645   // Try to create an in-lane repeating shuffle mask and then shuffle the
17646   // results into the target lanes.
17647   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17648           DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17649     return V;
17650 
17651   // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either input is already in place,
  // we will be able to shuffle the other input even across lanes in a single
  // instruction, so skip this pattern.
17655   if (!isShuffleMaskInputInPlace(0, Mask) &&
17656       !isShuffleMaskInputInPlace(1, Mask))
17657     if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17658             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17659       return Result;
17660 
17661   // Otherwise fall back on generic blend lowering.
17662   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17663                                               Subtarget, DAG);
17664 }
17665 
17666 /// Handle lowering of 8-lane 32-bit floating point shuffles.
17667 ///
17668 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
17669 /// isn't available.
17670 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17671                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17672                                  const X86Subtarget &Subtarget,
17673                                  SelectionDAG &DAG) {
17674   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17675   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17676   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17677 
17678   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
17679                                           Zeroable, Subtarget, DAG))
17680     return Blend;
17681 
17682   // Check for being able to broadcast a single element.
17683   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
17684                                                   Subtarget, DAG))
17685     return Broadcast;
17686 
17687   // If the shuffle mask is repeated in each 128-bit lane, we have many more
17688   // options to efficiently lower the shuffle.
17689   SmallVector<int, 4> RepeatedMask;
17690   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
17691     assert(RepeatedMask.size() == 4 &&
17692            "Repeated masks must be half the mask width!");
17693 
17694     // Use even/odd duplicate instructions for masks that match their pattern.
17695     if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17696       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
17697     if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17698       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
17699 
17700     if (V2.isUndef())
17701       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
17702                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17703 
17704     // Use dedicated unpack instructions for masks that match their pattern.
17705     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
17706       return V;
17707 
17708     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
17709     // have already handled any direct blends.
17710     return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
17711   }
17712 
17713   // Try to create an in-lane repeating shuffle mask and then shuffle the
17714   // results into the target lanes.
17715   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17716           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17717     return V;
17718 
17719   // If we have a single input shuffle with different shuffle patterns in the
17720   // two 128-bit lanes use the variable mask to VPERMILPS.
17721   if (V2.isUndef()) {
17722     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
17723       SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17724       return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
17725     }
17726     if (Subtarget.hasAVX2()) {
17727       SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17728       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
17729     }
17730     // Otherwise, fall back.
17731     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
17732                                                DAG, Subtarget);
17733   }
17734 
17735   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17736   // shuffle.
17737   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17738           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17739     return Result;
17740 
17741   // If we have VLX support, we can use VEXPAND.
17742   if (Subtarget.hasVLX())
17743     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
17744                                          DAG, Subtarget))
17745       return V;
17746 
  // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern,
  // try to split, since after the split we get more efficient code using
  // vpunpcklwd and vpunpckhwd than with vblend.
17750   if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
17751     return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
17752                                       DAG);
17753 
17754   // If we have AVX2 then we always want to lower with a blend because at v8 we
17755   // can fully permute the elements.
17756   if (Subtarget.hasAVX2())
17757     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
17758                                                 Subtarget, DAG);
17759 
17760   // Otherwise fall back on generic lowering.
17761   return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
17762                                     Subtarget, DAG);
17763 }
17764 
17765 /// Handle lowering of 8-lane 32-bit integer shuffles.
17766 ///
17767 /// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
17769 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17770                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17771                                  const X86Subtarget &Subtarget,
17772                                  SelectionDAG &DAG) {
17773   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17774   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17775   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17776   assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
17777 
17778   // Whenever we can lower this as a zext, that instruction is strictly faster
17779   // than any alternative. It also allows us to fold memory operands into the
17780   // shuffle in many cases.
17781   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17782                                                    Zeroable, Subtarget, DAG))
17783     return ZExt;
17784 
  // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern,
  // try to split, since after the split we get more efficient code than
  // vblend by using vpunpcklwd and vpunpckhwd.
17788   if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
17789       !Subtarget.hasAVX512())
17790     return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
17791                                       DAG);
17792 
17793   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
17794                                           Zeroable, Subtarget, DAG))
17795     return Blend;
17796 
17797   // Check for being able to broadcast a single element.
17798   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
17799                                                   Subtarget, DAG))
17800     return Broadcast;
17801 
17802   // If the shuffle mask is repeated in each 128-bit lane we can use more
17803   // efficient instructions that mirror the shuffles across the two 128-bit
17804   // lanes.
17805   SmallVector<int, 4> RepeatedMask;
17806   bool Is128BitLaneRepeatedShuffle =
17807       is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
17808   if (Is128BitLaneRepeatedShuffle) {
17809     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17810     if (V2.isUndef())
17811       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
17812                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17813 
17814     // Use dedicated unpack instructions for masks that match their pattern.
17815     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
17816       return V;
17817   }
17818 
17819   // Try to use shift instructions.
17820   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
17821                                           Zeroable, Subtarget, DAG))
17822     return Shift;
17823 
  // If we have VLX support, we can use VALIGN or VEXPAND.
17825   if (Subtarget.hasVLX()) {
17826     if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
17827                                               Subtarget, DAG))
17828       return Rotate;
17829 
17830     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
17831                                          DAG, Subtarget))
17832       return V;
17833   }
17834 
17835   // Try to use byte rotation instructions.
17836   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17837                                                 Subtarget, DAG))
17838     return Rotate;
17839 
17840   // Try to create an in-lane repeating shuffle mask and then shuffle the
17841   // results into the target lanes.
17842   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17843           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17844     return V;
17845 
17846   if (V2.isUndef()) {
17847     // Try to produce a fixed cross-128-bit lane permute followed by unpack
17848     // because that should be faster than the variable permute alternatives.
17849     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
17850       return V;
17851 
17852     // If the shuffle patterns aren't repeated but it's a single input, directly
17853     // generate a cross-lane VPERMD instruction.
17854     SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17855     return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17856   }
17857 
17858   // Assume that a single SHUFPS is faster than an alternative sequence of
17859   // multiple instructions (even if the CPU has a domain penalty).
17860   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17861   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17862     SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17863     SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17864     SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17865                                             CastV1, CastV2, DAG);
17866     return DAG.getBitcast(MVT::v8i32, ShufPS);
17867   }
17868 
17869   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17870   // shuffle.
17871   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17872           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17873     return Result;
17874 
17875   // Otherwise fall back on generic blend lowering.
17876   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17877                                               Subtarget, DAG);
17878 }
17879 
17880 /// Handle lowering of 16-lane 16-bit integer shuffles.
17881 ///
17882 /// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
17884 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17885                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17886                                   const X86Subtarget &Subtarget,
17887                                   SelectionDAG &DAG) {
17888   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17889   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17890   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17891   assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
17892 
17893   // Whenever we can lower this as a zext, that instruction is strictly faster
17894   // than any alternative. It also allows us to fold memory operands into the
17895   // shuffle in many cases.
17896   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17897           DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17898     return ZExt;
17899 
17900   // Check for being able to broadcast a single element.
17901   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
17902                                                   Subtarget, DAG))
17903     return Broadcast;
17904 
17905   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
17906                                           Zeroable, Subtarget, DAG))
17907     return Blend;
17908 
17909   // Use dedicated unpack instructions for masks that match their pattern.
17910   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
17911     return V;
17912 
17913   // Use dedicated pack instructions for masks that match their pattern.
17914   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
17915                                        Subtarget))
17916     return V;
17917 
  // Try to lower using a truncation.
17919   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17920                                        Subtarget, DAG))
17921     return V;
17922 
17923   // Try to use shift instructions.
17924   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
17925                                           Zeroable, Subtarget, DAG))
17926     return Shift;
17927 
17928   // Try to use byte rotation instructions.
17929   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17930                                                 Subtarget, DAG))
17931     return Rotate;
17932 
17933   // Try to create an in-lane repeating shuffle mask and then shuffle the
17934   // results into the target lanes.
17935   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17936           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17937     return V;
17938 
17939   if (V2.isUndef()) {
17940     // Try to use bit rotation instructions.
17941     if (SDValue Rotate =
17942             lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17943       return Rotate;
17944 
17945     // Try to produce a fixed cross-128-bit lane permute followed by unpack
17946     // because that should be faster than the variable permute alternatives.
17947     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
17948       return V;
17949 
17950     // There are no generalized cross-lane shuffle operations available on i16
17951     // element types.
17952     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17953       if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17954               DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17955         return V;
17956 
17957       return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17958                                                  DAG, Subtarget);
17959     }
17960 
17961     SmallVector<int, 8> RepeatedMask;
17962     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17963       // As this is a single-input shuffle, the repeated mask should be
17964       // a strictly valid v8i16 mask that we can pass through to the v8i16
17965       // lowering to handle even the v16 case.
17966       return lowerV8I16GeneralSingleInputShuffle(
17967           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17968     }
17969   }
17970 
17971   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17972                                               Zeroable, Subtarget, DAG))
17973     return PSHUFB;
17974 
17975   // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17976   if (Subtarget.hasBWI())
17977     return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17978 
17979   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17980   // shuffle.
17981   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17982           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17983     return Result;
17984 
17985   // Try to permute the lanes and then use a per-lane permute.
17986   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17987           DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17988     return V;
17989 
17990   // Otherwise fall back on generic lowering.
17991   return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
17992                                     Subtarget, DAG);
17993 }
17994 
17995 /// Handle lowering of 32-lane 8-bit integer shuffles.
17996 ///
17997 /// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
17999 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18000                                  const APInt &Zeroable, SDValue V1, SDValue V2,
18001                                  const X86Subtarget &Subtarget,
18002                                  SelectionDAG &DAG) {
18003   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18004   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18005   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18006   assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
18007 
18008   // Whenever we can lower this as a zext, that instruction is strictly faster
18009   // than any alternative. It also allows us to fold memory operands into the
18010   // shuffle in many cases.
18011   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
18012                                                    Zeroable, Subtarget, DAG))
18013     return ZExt;
18014 
18015   // Check for being able to broadcast a single element.
18016   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
18017                                                   Subtarget, DAG))
18018     return Broadcast;
18019 
18020   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
18021                                           Zeroable, Subtarget, DAG))
18022     return Blend;
18023 
18024   // Use dedicated unpack instructions for masks that match their pattern.
18025   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
18026     return V;
18027 
18028   // Use dedicated pack instructions for masks that match their pattern.
18029   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
18030                                        Subtarget))
18031     return V;
18032 
  // Try to lower using a truncation.
18034   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
18035                                        Subtarget, DAG))
18036     return V;
18037 
18038   // Try to use shift instructions.
18039   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
18040                                           Zeroable, Subtarget, DAG))
18041     return Shift;
18042 
18043   // Try to use byte rotation instructions.
18044   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
18045                                                 Subtarget, DAG))
18046     return Rotate;
18047 
18048   // Try to use bit rotation instructions.
18049   if (V2.isUndef())
18050     if (SDValue Rotate =
18051             lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
18052       return Rotate;
18053 
18054   // Try to create an in-lane repeating shuffle mask and then shuffle the
18055   // results into the target lanes.
18056   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18057           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18058     return V;
18059 
18060   // There are no generalized cross-lane shuffle operations available on i8
18061   // element types.
18062   if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
18063     // Try to produce a fixed cross-128-bit lane permute followed by unpack
18064     // because that should be faster than the variable permute alternatives.
18065     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
18066       return V;
18067 
18068     if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18069             DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18070       return V;
18071 
18072     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
18073                                                DAG, Subtarget);
18074   }
18075 
18076   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
18077                                               Zeroable, Subtarget, DAG))
18078     return PSHUFB;
18079 
18080   // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
18081   if (Subtarget.hasVBMI())
18082     return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
18083 
18084   // Try to simplify this by merging 128-bit lanes to enable a lane-based
18085   // shuffle.
18086   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18087           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18088     return Result;
18089 
18090   // Try to permute the lanes and then use a per-lane permute.
18091   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18092           DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18093     return V;
18094 
  // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
  // by zeroable elements in the remaining 24 elements. Turn this into two
  // vmovqb instructions shuffled together.
18098   if (Subtarget.hasVLX())
18099     if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
18100                                                   Mask, Zeroable, DAG))
18101       return V;
18102 
18103   // Otherwise fall back on generic lowering.
18104   return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
18105                                     Subtarget, DAG);
18106 }
18107 
18108 /// High-level routine to lower various 256-bit x86 vector shuffles.
18109 ///
18110 /// This routine either breaks down the specific type of a 256-bit x86 vector
18111 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
18112 /// together based on the available instructions.
18113 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
18114                                   SDValue V1, SDValue V2, const APInt &Zeroable,
18115                                   const X86Subtarget &Subtarget,
18116                                   SelectionDAG &DAG) {
  // If exactly one element comes from V2 and it feeds the zero element of the
  // result, insert it into V1 if we can do so cheaply.
18119   int NumElts = VT.getVectorNumElements();
18120   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18121 
18122   if (NumV2Elements == 1 && Mask[0] >= NumElts)
18123     if (SDValue Insertion = lowerShuffleAsElementInsertion(
18124             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18125       return Insertion;
18126 
18127   // Handle special cases where the lower or upper half is UNDEF.
18128   if (SDValue V =
18129           lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18130     return V;
18131 
18132   // There is a really nice hard cut-over between AVX1 and AVX2 that means we
18133   // can check for those subtargets here and avoid much of the subtarget
18134   // querying in the per-vector-type lowering routines. With AVX1 we have
18135   // essentially *zero* ability to manipulate a 256-bit vector with integer
18136   // types. Since we'll use floating point types there eventually, just
18137   // immediately cast everything to a float and operate entirely in that domain.
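  // (For example, a v4i64 shuffle is performed here as a v4f64 shuffle.)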
18138   if (VT.isInteger() && !Subtarget.hasAVX2()) {
18139     int ElementBits = VT.getScalarSizeInBits();
18140     if (ElementBits < 32) {
      // No floating point type available; if we can't use the bit operations
      // for masking/blending then decompose into 128-bit vectors.
18143       if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18144                                             Subtarget, DAG))
18145         return V;
18146       if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18147         return V;
18148       return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18149     }
18150 
18151     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
18152                                 VT.getVectorNumElements());
18153     V1 = DAG.getBitcast(FpVT, V1);
18154     V2 = DAG.getBitcast(FpVT, V2);
18155     return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
18156   }
18157 
18158   if (VT == MVT::v16f16) {
18159     V1 = DAG.getBitcast(MVT::v16i16, V1);
18160     V2 = DAG.getBitcast(MVT::v16i16, V2);
18161     return DAG.getBitcast(MVT::v16f16,
18162                           DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
18163   }
18164 
18165   switch (VT.SimpleTy) {
18166   case MVT::v4f64:
18167     return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18168   case MVT::v4i64:
18169     return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18170   case MVT::v8f32:
18171     return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18172   case MVT::v8i32:
18173     return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18174   case MVT::v16i16:
18175     return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18176   case MVT::v32i8:
18177     return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18178 
18179   default:
18180     llvm_unreachable("Not a valid 256-bit x86 vector type!");
18181   }
18182 }
18183 
/// Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
18185 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
18186                                   const APInt &Zeroable, SDValue V1, SDValue V2,
18187                                   const X86Subtarget &Subtarget,
18188                                   SelectionDAG &DAG) {
18189   assert(VT.getScalarSizeInBits() == 64 &&
18190          "Unexpected element type size for 128bit shuffle.");
18191 
  // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() is
  // most probably a better solution for that case.
18194   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
18195 
18196   // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
18197   SmallVector<int, 4> Widened128Mask;
18198   if (!canWidenShuffleElements(Mask, Widened128Mask))
18199     return SDValue();
18200   assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
18201 
18202   // Try to use an insert into a zero vector.
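  // Each Zeroable bit corresponds to one of the eight 64-bit elements, so
  // e.g. (Zeroable & 0xf0) == 0xf0 means elements 4..7 are known zero and only
  // the low 256 bits are live, while (Zeroable & 0x0c) == 0x0c additionally
  // lets us preserve just the low 128 bits (elements 0..1).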
18203   if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
18204       (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
18205     unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
18206     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
18207     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
18208                               DAG.getIntPtrConstant(0, DL));
18209     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18210                        getZeroVector(VT, Subtarget, DAG, DL), LoV,
18211                        DAG.getIntPtrConstant(0, DL));
18212   }
18213 
18214   // Check for patterns which can be matched with a single insert of a 256-bit
18215   // subvector.
18216   bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
18217   if (OnlyUsesV1 ||
18218       isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
18219     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
18220     SDValue SubVec =
18221         DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
18222                     DAG.getIntPtrConstant(0, DL));
18223     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
18224                        DAG.getIntPtrConstant(4, DL));
18225   }
18226 
18227   // See if this is an insertion of the lower 128-bits of V2 into V1.
18228   bool IsInsert = true;
18229   int V2Index = -1;
18230   for (int i = 0; i < 4; ++i) {
18231     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
18232     if (Widened128Mask[i] < 0)
18233       continue;
18234 
18235     // Make sure all V1 subvectors are in place.
18236     if (Widened128Mask[i] < 4) {
18237       if (Widened128Mask[i] != i) {
18238         IsInsert = false;
18239         break;
18240       }
18241     } else {
      // Make sure we only have a single V2 index and it's the lowest 128 bits.
18243       if (V2Index >= 0 || Widened128Mask[i] != 4) {
18244         IsInsert = false;
18245         break;
18246       }
18247       V2Index = i;
18248     }
18249   }
18250   if (IsInsert && V2Index >= 0) {
18251     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
18252     SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
18253                                  DAG.getIntPtrConstant(0, DL));
18254     return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
18255   }
18256 
  // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
18258   // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
18259   // possible we at least ensure the lanes stay sequential to help later
18260   // combines.
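  // e.g. a Widened128Mask of {0, -1, 6, 7} becomes {0, 1, 6, 7}, replacing the
  // undef lane with the value that keeps both 256-bit halves sequential.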
18261   SmallVector<int, 2> Widened256Mask;
18262   if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
18263     Widened128Mask.clear();
18264     narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
18265   }
18266 
18267   // Try to lower to vshuf64x2/vshuf32x4.
18268   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
18269   unsigned PermMask = 0;
  // Ensure elements came from the same Op.
18271   for (int i = 0; i < 4; ++i) {
18272     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
18273     if (Widened128Mask[i] < 0)
18274       continue;
18275 
18276     SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
18277     unsigned OpIndex = i / 2;
18278     if (Ops[OpIndex].isUndef())
18279       Ops[OpIndex] = Op;
18280     else if (Ops[OpIndex] != Op)
18281       return SDValue();
18282 
18283     // Convert the 128-bit shuffle mask selection values into 128-bit selection
18284     // bits defined by a vshuf64x2 instruction's immediate control byte.
18285     PermMask |= (Widened128Mask[i] % 4) << (i * 2);
18286   }
18287 
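  // For example, Widened128Mask = {0, 1, 4, 5} takes the low 256 bits of V1
  // and the low 256 bits of V2: Ops becomes {V1, V2} and
  // PermMask == (0 << 0) | (1 << 2) | (0 << 4) | (1 << 6) == 0x44.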
18288   return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
18289                      DAG.getTargetConstant(PermMask, DL, MVT::i8));
18290 }
18291 
18292 /// Handle lowering of 8-lane 64-bit floating point shuffles.
18293 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18294                                  const APInt &Zeroable, SDValue V1, SDValue V2,
18295                                  const X86Subtarget &Subtarget,
18296                                  SelectionDAG &DAG) {
18297   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
18298   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
18299   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18300 
18301   if (V2.isUndef()) {
18302     // Use low duplicate instructions for masks that match their pattern.
18303     if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
18304       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
18305 
18306     if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
18307       // Non-half-crossing single input shuffles can be lowered with an
18308       // interleaved permutation.
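      // Each immediate bit i selects the high (set) or low (clear) 64-bit half
      // of the 128-bit lane holding element i, e.g. Mask = {1, 0, 3, 2, 5, 4,
      // 7, 6} (swap within each lane) gives VPERMILPMask == 0x55.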
18309       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18310                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
18311                               ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
18312                               ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
18313       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
18314                          DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18315     }
18316 
18317     SmallVector<int, 4> RepeatedMask;
18318     if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
18319       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
18320                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18321   }
18322 
18323   if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
18324                                            V2, Subtarget, DAG))
18325     return Shuf128;
18326 
18327   if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
18328     return Unpck;
18329 
18330   // Check if the blend happens to exactly fit that of SHUFPD.
18331   if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
18332                                           Zeroable, Subtarget, DAG))
18333     return Op;
18334 
18335   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
18336                                        DAG, Subtarget))
18337     return V;
18338 
18339   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
18340                                           Zeroable, Subtarget, DAG))
18341     return Blend;
18342 
18343   return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
18344 }
18345 
18346 /// Handle lowering of 16-lane 32-bit floating point shuffles.
18347 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18348                                   const APInt &Zeroable, SDValue V1, SDValue V2,
18349                                   const X86Subtarget &Subtarget,
18350                                   SelectionDAG &DAG) {
18351   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
18352   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
18353   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18354 
18355   // If the shuffle mask is repeated in each 128-bit lane, we have many more
18356   // options to efficiently lower the shuffle.
18357   SmallVector<int, 4> RepeatedMask;
18358   if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
18359     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18360 
18361     // Use even/odd duplicate instructions for masks that match their pattern.
18362     if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18363       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
18364     if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18365       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
18366 
18367     if (V2.isUndef())
18368       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
18369                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18370 
18371     // Use dedicated unpack instructions for masks that match their pattern.
18372     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
18373       return V;
18374 
18375     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
18376                                             Zeroable, Subtarget, DAG))
18377       return Blend;
18378 
18379     // Otherwise, fall back to a SHUFPS sequence.
18380     return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
18381   }
18382 
18383   // Try to create an in-lane repeating shuffle mask and then shuffle the
18384   // results into the target lanes.
18385   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18386           DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
18387     return V;
18388 
18389   // If we have a single input shuffle with different shuffle patterns in the
18390   // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
18391   if (V2.isUndef() &&
18392       !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
18393     SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
18394     return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
18395   }
18396 
18397   // If we have AVX512F support, we can use VEXPAND.
18398   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
18399                                              V1, V2, DAG, Subtarget))
18400     return V;
18401 
18402   return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
18403 }
18404 
18405 /// Handle lowering of 8-lane 64-bit integer shuffles.
18406 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18407                                  const APInt &Zeroable, SDValue V1, SDValue V2,
18408                                  const X86Subtarget &Subtarget,
18409                                  SelectionDAG &DAG) {
18410   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
18411   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
18412   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18413 
18414   if (V2.isUndef()) {
18415     // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18416     // can use lower latency instructions that will operate on all four
18417     // 128-bit lanes.
18418     SmallVector<int, 2> Repeated128Mask;
18419     if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
18420       SmallVector<int, 4> PSHUFDMask;
18421       narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
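      // e.g. a repeated i64 mask of {1, 0} becomes the i32 PSHUFD mask
      // {2, 3, 0, 1}, swapping the two halves of every 128-bit lane.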
18422       return DAG.getBitcast(
18423           MVT::v8i64,
18424           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
18425                       DAG.getBitcast(MVT::v16i32, V1),
18426                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18427     }
18428 
18429     SmallVector<int, 4> Repeated256Mask;
18430     if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
18431       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
18432                          getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
18433   }
18434 
18435   if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
18436                                            V2, Subtarget, DAG))
18437     return Shuf128;
18438 
18439   // Try to use shift instructions.
18440   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
18441                                           Zeroable, Subtarget, DAG))
18442     return Shift;
18443 
18444   // Try to use VALIGN.
18445   if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
18446                                             Subtarget, DAG))
18447     return Rotate;
18448 
18449   // Try to use PALIGNR.
18450   if (Subtarget.hasBWI())
18451     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
18452                                                   Subtarget, DAG))
18453       return Rotate;
18454 
18455   if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
18456     return Unpck;
18457 
18458   // If we have AVX512F support, we can use VEXPAND.
18459   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
18460                                        DAG, Subtarget))
18461     return V;
18462 
18463   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
18464                                           Zeroable, Subtarget, DAG))
18465     return Blend;
18466 
18467   return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
18468 }
18469 
18470 /// Handle lowering of 16-lane 32-bit integer shuffles.
18471 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18472                                   const APInt &Zeroable, SDValue V1, SDValue V2,
18473                                   const X86Subtarget &Subtarget,
18474                                   SelectionDAG &DAG) {
18475   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
18476   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
18477   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18478 
18479   // Whenever we can lower this as a zext, that instruction is strictly faster
18480   // than any alternative. It also allows us to fold memory operands into the
18481   // shuffle in many cases.
18482   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18483           DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
18484     return ZExt;
18485 
18486   // If the shuffle mask is repeated in each 128-bit lane we can use more
18487   // efficient instructions that mirror the shuffles across the four 128-bit
18488   // lanes.
18489   SmallVector<int, 4> RepeatedMask;
18490   bool Is128BitLaneRepeatedShuffle =
18491       is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
18492   if (Is128BitLaneRepeatedShuffle) {
18493     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18494     if (V2.isUndef())
18495       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
18496                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18497 
18498     // Use dedicated unpack instructions for masks that match their pattern.
18499     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
18500       return V;
18501   }
18502 
18503   // Try to use shift instructions.
18504   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
18505                                           Zeroable, Subtarget, DAG))
18506     return Shift;
18507 
18508   // Try to use VALIGN.
18509   if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
18510                                             Subtarget, DAG))
18511     return Rotate;
18512 
18513   // Try to use byte rotation instructions.
18514   if (Subtarget.hasBWI())
18515     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
18516                                                   Subtarget, DAG))
18517       return Rotate;
18518 
18519   // Assume that a single SHUFPS is faster than using a permv shuffle.
18520   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18521   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18522     SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
18523     SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
18524     SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
18525                                             CastV1, CastV2, DAG);
18526     return DAG.getBitcast(MVT::v16i32, ShufPS);
18527   }
18528 
18529   // Try to create an in-lane repeating shuffle mask and then shuffle the
18530   // results into the target lanes.
18531   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18532           DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
18533     return V;
18534 
18535   // If we have AVX512F support, we can use VEXPAND.
18536   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
18537                                        DAG, Subtarget))
18538     return V;
18539 
18540   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
18541                                           Zeroable, Subtarget, DAG))
18542     return Blend;
18543 
18544   return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
18545 }
18546 
18547 /// Handle lowering of 32-lane 16-bit integer shuffles.
18548 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18549                                   const APInt &Zeroable, SDValue V1, SDValue V2,
18550                                   const X86Subtarget &Subtarget,
18551                                   SelectionDAG &DAG) {
18552   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
18553   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
18554   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18555   assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
18556 
18557   // Whenever we can lower this as a zext, that instruction is strictly faster
18558   // than any alternative. It also allows us to fold memory operands into the
18559   // shuffle in many cases.
18560   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18561           DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18562     return ZExt;
18563 
18564   // Use dedicated unpack instructions for masks that match their pattern.
18565   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
18566     return V;
18567 
18568   // Use dedicated pack instructions for masks that match their pattern.
18569   if (SDValue V =
18570           lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
18571     return V;
18572 
18573   // Try to use shift instructions.
18574   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
18575                                           Zeroable, Subtarget, DAG))
18576     return Shift;
18577 
18578   // Try to use byte rotation instructions.
18579   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
18580                                                 Subtarget, DAG))
18581     return Rotate;
18582 
18583   if (V2.isUndef()) {
18584     // Try to use bit rotation instructions.
18585     if (SDValue Rotate =
18586             lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
18587       return Rotate;
18588 
18589     SmallVector<int, 8> RepeatedMask;
18590     if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
18591       // As this is a single-input shuffle, the repeated mask should be
18592       // a strictly valid v8i16 mask that we can pass through to the v8i16
18593       // lowering to handle even the v32 case.
18594       return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
18595                                                  RepeatedMask, Subtarget, DAG);
18596     }
18597   }
18598 
18599   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
18600                                           Zeroable, Subtarget, DAG))
18601     return Blend;
18602 
18603   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
18604                                               Zeroable, Subtarget, DAG))
18605     return PSHUFB;
18606 
18607   return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
18608 }
18609 
18610 /// Handle lowering of 64-lane 8-bit integer shuffles.
18611 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18612                                  const APInt &Zeroable, SDValue V1, SDValue V2,
18613                                  const X86Subtarget &Subtarget,
18614                                  SelectionDAG &DAG) {
18615   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18616   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18617   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
18618   assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
18619 
18620   // Whenever we can lower this as a zext, that instruction is strictly faster
18621   // than any alternative. It also allows us to fold memory operands into the
18622   // shuffle in many cases.
18623   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18624           DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
18625     return ZExt;
18626 
18627   // Use dedicated unpack instructions for masks that match their pattern.
18628   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
18629     return V;
18630 
18631   // Use dedicated pack instructions for masks that match their pattern.
18632   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
18633                                        Subtarget))
18634     return V;
18635 
18636   // Try to use shift instructions.
18637   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
18638                                           Zeroable, Subtarget, DAG))
18639     return Shift;
18640 
18641   // Try to use byte rotation instructions.
18642   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
18643                                                 Subtarget, DAG))
18644     return Rotate;
18645 
18646   // Try to use bit rotation instructions.
18647   if (V2.isUndef())
18648     if (SDValue Rotate =
18649             lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18650       return Rotate;
18651 
18652   // Lower as AND if possible.
18653   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
18654                                              Zeroable, Subtarget, DAG))
18655     return Masked;
18656 
18657   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
18658                                               Zeroable, Subtarget, DAG))
18659     return PSHUFB;
18660 
18661   // VBMI can use VPERMV/VPERMV3 byte shuffles.
18662   if (Subtarget.hasVBMI())
18663     return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18664 
18665   // Try to create an in-lane repeating shuffle mask and then shuffle the
18666   // results into the target lanes.
18667   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18668           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18669     return V;
18670 
18671   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
18672                                           Zeroable, Subtarget, DAG))
18673     return Blend;
18674 
18675   // Try to simplify this by merging 128-bit lanes to enable a lane-based
18676   // shuffle.
18677   if (!V2.isUndef())
18678     if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18679             DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18680       return Result;
18681 
18682   // FIXME: Implement direct support for this type!
18683   return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
18684 }
18685 
18686 /// High-level routine to lower various 512-bit x86 vector shuffles.
18687 ///
18688 /// This routine either breaks down the specific type of a 512-bit x86 vector
18689 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
18690 /// together based on the available instructions.
18691 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18692                                   MVT VT, SDValue V1, SDValue V2,
18693                                   const APInt &Zeroable,
18694                                   const X86Subtarget &Subtarget,
18695                                   SelectionDAG &DAG) {
18696   assert(Subtarget.hasAVX512() &&
18697          "Cannot lower 512-bit vectors w/ basic ISA!");
18698 
18699   // If we have a single input to the zero element, insert that into V1 if we
18700   // can do so cheaply.
18701   int NumElts = Mask.size();
18702   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18703 
18704   if (NumV2Elements == 1 && Mask[0] >= NumElts)
18705     if (SDValue Insertion = lowerShuffleAsElementInsertion(
18706             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18707       return Insertion;
18708 
18709   // Handle special cases where the lower or upper half is UNDEF.
18710   if (SDValue V =
18711           lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18712     return V;
18713 
18714   // Check for being able to broadcast a single element.
18715   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18716                                                   Subtarget, DAG))
18717     return Broadcast;
18718 
18719   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18720     // Try using bit ops for masking and blending before falling back to
18721     // splitting.
18722     if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18723                                           Subtarget, DAG))
18724       return V;
18725     if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18726       return V;
18727 
18728     return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18729   }
18730 
18731   if (VT == MVT::v32f16) {
18732     V1 = DAG.getBitcast(MVT::v32i16, V1);
18733     V2 = DAG.getBitcast(MVT::v32i16, V2);
18734     return DAG.getBitcast(MVT::v32f16,
18735                           DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
18736   }
18737 
18738   // Dispatch to each element type for lowering. If we don't have support for
18739   // specific element type shuffles at 512 bits, immediately split them and
18740   // lower them. Each lowering routine of a given type is allowed to assume that
18741   // the requisite ISA extensions for that element type are available.
18742   switch (VT.SimpleTy) {
18743   case MVT::v8f64:
18744     return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18745   case MVT::v16f32:
18746     return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18747   case MVT::v8i64:
18748     return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18749   case MVT::v16i32:
18750     return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18751   case MVT::v32i16:
18752     return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18753   case MVT::v64i8:
18754     return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18755 
18756   default:
18757     llvm_unreachable("Not a valid 512-bit x86 vector type!");
18758   }
18759 }
18760 
18761 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
18762                                          MVT VT, SDValue V1, SDValue V2,
18763                                          const X86Subtarget &Subtarget,
18764                                          SelectionDAG &DAG) {
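  // Match a unary mask that is a uniform shift right of the source, e.g. a
  // v8i1 Mask of {3, 4, 5, 6, 7, -1, -1, -1} becomes a KSHIFTR by 3 (widened
  // to v16i1 first when the subtarget lacks DQI).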
18765   // Shuffle should be unary.
18766   if (!V2.isUndef())
18767     return SDValue();
18768 
18769   int ShiftAmt = -1;
18770   int NumElts = Mask.size();
18771   for (int i = 0; i != NumElts; ++i) {
18772     int M = Mask[i];
18773     assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
18774            "Unexpected mask index.");
18775     if (M < 0)
18776       continue;
18777 
18778     // The first non-undef element determines our shift amount.
18779     if (ShiftAmt < 0) {
18780       ShiftAmt = M - i;
18781       // Need to be shifting right.
18782       if (ShiftAmt <= 0)
18783         return SDValue();
18784     }
18785     // All non-undef elements must shift by the same amount.
18786     if (ShiftAmt != M - i)
18787       return SDValue();
18788   }
18789   assert(ShiftAmt >= 0 && "All undef?");
18790 
  // Great, we found a right shift.
18792   MVT WideVT = VT;
18793   if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18794     WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18795   SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18796                             DAG.getUNDEF(WideVT), V1,
18797                             DAG.getIntPtrConstant(0, DL));
18798   Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
18799                     DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18800   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18801                      DAG.getIntPtrConstant(0, DL));
18802 }
18803 
18804 // Determine if this shuffle can be implemented with a KSHIFT instruction.
18805 // Returns the shift amount if possible or -1 if not. This is a simplified
18806 // version of matchShuffleAsShift.
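// For example, with MaskOffset == 0 a v8i1 Mask of {-1, 0, 1, 2, 3, 4, 5, 6}
// where Zeroable[0] is set matches X86ISD::KSHIFTL with a shift amount of 1.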
18807 static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18808                                     int MaskOffset, const APInt &Zeroable) {
18809   int Size = Mask.size();
18810 
18811   auto CheckZeros = [&](int Shift, bool Left) {
18812     for (int j = 0; j < Shift; ++j)
18813       if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18814         return false;
18815 
18816     return true;
18817   };
18818 
18819   auto MatchShift = [&](int Shift, bool Left) {
18820     unsigned Pos = Left ? Shift : 0;
18821     unsigned Low = Left ? 0 : Shift;
18822     unsigned Len = Size - Shift;
18823     return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18824   };
18825 
18826   for (int Shift = 1; Shift != Size; ++Shift)
18827     for (bool Left : {true, false})
18828       if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18829         Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18830         return Shift;
18831       }
18832 
18833   return -1;
18834 }
18835 
18837 // Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle it, and then truncate it back.
18841 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18842                                 MVT VT, SDValue V1, SDValue V2,
18843                                 const APInt &Zeroable,
18844                                 const X86Subtarget &Subtarget,
18845                                 SelectionDAG &DAG) {
18846   assert(Subtarget.hasAVX512() &&
18847          "Cannot lower 512-bit vectors w/o basic ISA!");
18848 
18849   int NumElts = Mask.size();
18850 
18851   // Try to recognize shuffles that are just padding a subvector with zeros.
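  // e.g. a v8i1 shuffle taking elements {0, 1, 2, 3} of V1 whose upper four
  // lanes are known zero becomes an insert of V1's low v4i1 into a zero vector.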
18852   int SubvecElts = 0;
18853   int Src = -1;
18854   for (int i = 0; i != NumElts; ++i) {
18855     if (Mask[i] >= 0) {
      // Grab the source from the first valid mask element. All subsequent
      // elements need to use this same source.
18858       if (Src < 0)
18859         Src = Mask[i] / NumElts;
18860       if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18861         break;
18862     }
18863 
18864     ++SubvecElts;
18865   }
18866   assert(SubvecElts != NumElts && "Identity shuffle?");
18867 
  // Clip to a power of 2.
18869   SubvecElts = PowerOf2Floor(SubvecElts);
18870 
18871   // Make sure the number of zeroable bits in the top at least covers the bits
18872   // not covered by the subvector.
18873   if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
18874     assert(Src >= 0 && "Expected a source!");
18875     MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18876     SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
18877                                   Src == 0 ? V1 : V2,
18878                                   DAG.getIntPtrConstant(0, DL));
18879     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18880                        DAG.getConstant(0, DL, VT),
18881                        Extract, DAG.getIntPtrConstant(0, DL));
18882   }
18883 
18884   // Try a simple shift right with undef elements. Later we'll try with zeros.
18885   if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
18886                                                 DAG))
18887     return Shift;
18888 
18889   // Try to match KSHIFTs.
18890   unsigned Offset = 0;
18891   for (SDValue V : { V1, V2 }) {
18892     unsigned Opcode;
18893     int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18894     if (ShiftAmt >= 0) {
18895       MVT WideVT = VT;
18896       if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18897         WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18898       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18899                                 DAG.getUNDEF(WideVT), V,
18900                                 DAG.getIntPtrConstant(0, DL));
18901       // Widened right shifts need two shifts to ensure we shift in zeroes.
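      // e.g. for a v4i1 shuffle widened to v16i1, first KSHIFTL by 12 to put
      // the four original bits in the MSBs, then KSHIFTR by (ShiftAmt + 12).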
18902       if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18903         int WideElts = WideVT.getVectorNumElements();
18904         // Shift left to put the original vector in the MSBs of the new size.
18905         Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18906                           DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18907         // Increase the shift amount to account for the left shift.
18908         ShiftAmt += WideElts - NumElts;
18909       }
18910 
18911       Res = DAG.getNode(Opcode, DL, WideVT, Res,
18912                         DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18913       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18914                          DAG.getIntPtrConstant(0, DL));
18915     }
18916     Offset += NumElts; // Increment for next iteration.
18917   }
18918 
18921   MVT ExtVT;
18922   switch (VT.SimpleTy) {
18923   default:
18924     llvm_unreachable("Expected a vector of i1 elements");
18925   case MVT::v2i1:
18926     ExtVT = MVT::v2i64;
18927     break;
18928   case MVT::v4i1:
18929     ExtVT = MVT::v4i32;
18930     break;
18931   case MVT::v8i1:
    // Take a 512-bit type; there are more shuffle options on KNL. If we have
    // VLX, use a 256-bit shuffle.
18934     ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18935     break;
18936   case MVT::v16i1:
18937     // Take 512-bit type, unless we are avoiding 512-bit types and have the
18938     // 256-bit operation available.
18939     ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18940     break;
18941   case MVT::v32i1:
18942     // Take 512-bit type, unless we are avoiding 512-bit types and have the
18943     // 256-bit operation available.
18944     assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18945     ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18946     break;
18947   case MVT::v64i1:
18948     // Fall back to scalarization. FIXME: We can do better if the shuffle
18949     // can be partitioned cleanly.
18950     if (!Subtarget.useBWIRegs())
18951       return SDValue();
18952     ExtVT = MVT::v64i8;
18953     break;
18954   }
18955 
18956   V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18957   V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18958 
18959   SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
  // The i1 elements were sign-extended, so each lane is all-zeros or all-ones;
  // a signed compare against zero converts the result back to a mask.
18961   int NumElems = VT.getVectorNumElements();
18962   if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18963       (Subtarget.hasDQI() && (NumElems < 32)))
18964     return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18965                        Shuffle, ISD::SETGT);
18966 
18967   return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18968 }
18969 
18970 /// Helper function that returns true if the shuffle mask should be
18971 /// commuted to improve canonicalization.
18972 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18973   int NumElements = Mask.size();
18974 
18975   int NumV1Elements = 0, NumV2Elements = 0;
18976   for (int M : Mask)
18977     if (M < 0)
18978       continue;
18979     else if (M < NumElements)
18980       ++NumV1Elements;
18981     else
18982       ++NumV2Elements;
18983 
18984   // Commute the shuffle as needed such that more elements come from V1 than
18985   // V2. This allows us to match the shuffle pattern strictly on how many
18986   // elements come from V1 without handling the symmetric cases.
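  // For example, a v4 mask of {4, 5, 6, 3} uses three elements from V2 but
  // only one from V1, so this returns true and the caller commutes the
  // shuffle to {0, 1, 2, 7}.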
18987   if (NumV2Elements > NumV1Elements)
18988     return true;
18989 
18990   assert(NumV1Elements > 0 && "No V1 indices");
18991 
18992   if (NumV2Elements == 0)
18993     return false;
18994 
18995   // When the number of V1 and V2 elements are the same, try to minimize the
18996   // number of uses of V2 in the low half of the vector. When that is tied,
18997   // ensure that the sum of indices for V1 is equal to or lower than the sum
  // of indices for V2. When those are equal, try to ensure that the number of odd
18999   // indices for V1 is lower than the number of odd indices for V2.
19000   if (NumV1Elements == NumV2Elements) {
19001     int LowV1Elements = 0, LowV2Elements = 0;
19002     for (int M : Mask.slice(0, NumElements / 2))
19003       if (M >= NumElements)
19004         ++LowV2Elements;
19005       else if (M >= 0)
19006         ++LowV1Elements;
19007     if (LowV2Elements > LowV1Elements)
19008       return true;
19009     if (LowV2Elements == LowV1Elements) {
19010       int SumV1Indices = 0, SumV2Indices = 0;
19011       for (int i = 0, Size = Mask.size(); i < Size; ++i)
19012         if (Mask[i] >= NumElements)
19013           SumV2Indices += i;
19014         else if (Mask[i] >= 0)
19015           SumV1Indices += i;
19016       if (SumV2Indices < SumV1Indices)
19017         return true;
19018       if (SumV2Indices == SumV1Indices) {
19019         int NumV1OddIndices = 0, NumV2OddIndices = 0;
19020         for (int i = 0, Size = Mask.size(); i < Size; ++i)
19021           if (Mask[i] >= NumElements)
19022             NumV2OddIndices += i % 2;
19023           else if (Mask[i] >= 0)
19024             NumV1OddIndices += i % 2;
19025         if (NumV2OddIndices < NumV1OddIndices)
19026           return true;
19027       }
19028     }
19029   }
19030 
19031   return false;
19032 }
19033 
19034 // Forward declaration.
19035 static SDValue canonicalizeShuffleMaskWithHorizOp(
19036     MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
19037     unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
19038     const X86Subtarget &Subtarget);
19039 
/// Top-level lowering for x86 vector shuffles.
19041 ///
19042 /// This handles decomposition, canonicalization, and lowering of all x86
19043 /// vector shuffles. Most of the specific lowering strategies are encapsulated
19044 /// above in helper routines. The canonicalization attempts to widen shuffles
19045 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
19046 /// s.t. only one of the two inputs needs to be tested, etc.
19047 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
19048                                    SelectionDAG &DAG) {
19049   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
19050   ArrayRef<int> OrigMask = SVOp->getMask();
19051   SDValue V1 = Op.getOperand(0);
19052   SDValue V2 = Op.getOperand(1);
19053   MVT VT = Op.getSimpleValueType();
19054   int NumElements = VT.getVectorNumElements();
19055   SDLoc DL(Op);
19056   bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
19057 
19058   assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
19059          "Can't lower MMX shuffles");
19060 
19061   bool V1IsUndef = V1.isUndef();
19062   bool V2IsUndef = V2.isUndef();
19063   if (V1IsUndef && V2IsUndef)
19064     return DAG.getUNDEF(VT);
19065 
  // When we create a shuffle node we put the UNDEF node as the second operand,
19067   // but in some cases the first operand may be transformed to UNDEF.
19068   // In this case we should just commute the node.
19069   if (V1IsUndef)
19070     return DAG.getCommutedVectorShuffle(*SVOp);
19071 
19072   // Check for non-undef masks pointing at an undef vector and make the masks
19073   // undef as well. This makes it easier to match the shuffle based solely on
19074   // the mask.
19075   if (V2IsUndef &&
19076       any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
19077     SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
19078     for (int &M : NewMask)
19079       if (M >= NumElements)
19080         M = -1;
19081     return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
19082   }
19083 
19084   // Check for illegal shuffle mask element index values.
19085   int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
19086   (void)MaskUpperLimit;
19087   assert(llvm::all_of(OrigMask,
19088                       [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
19089          "Out of bounds shuffle index");
19090 
19091   // We actually see shuffles that are entirely re-arrangements of a set of
19092   // zero inputs. This mostly happens while decomposing complex shuffles into
19093   // simple ones. Directly lower these as a buildvector of zeros.
19094   APInt KnownUndef, KnownZero;
19095   computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
19096 
19097   APInt Zeroable = KnownUndef | KnownZero;
19098   if (Zeroable.isAllOnes())
19099     return getZeroVector(VT, Subtarget, DAG, DL);
19100 
19101   bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
19102 
19103   // Try to collapse shuffles into using a vector type with fewer elements but
19104   // wider element types. We cap this to not form integers or floating point
19105   // elements wider than 64 bits. It does not seem beneficial to form i128
19106   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
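  // e.g. a v4i32 mask of {0, 1, 6, 7} widens to the v2i64 mask {0, 3}.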
19107   SmallVector<int, 16> WidenedMask;
19108   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
19109       canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
19110     // Shuffle mask widening should not interfere with a broadcast opportunity
19111     // by obfuscating the operands with bitcasts.
19112     // TODO: Avoid lowering directly from this top-level function: make this
19113     // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
19114     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
19115                                                     Subtarget, DAG))
19116       return Broadcast;
19117 
19118     MVT NewEltVT = VT.isFloatingPoint()
19119                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
19120                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
19121     int NewNumElts = NumElements / 2;
19122     MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
19123     // Make sure that the new vector type is legal. For example, v2f64 isn't
19124     // legal on SSE1.
19125     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
19126       if (V2IsZero) {
19127         // Modify the new Mask to take all zeros from the all-zero vector.
19128         // Choose indices that are blend-friendly.
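        // e.g. a widened v2i64 mask of {0, SM_SentinelZero} becomes {0, 3},
        // selecting lane 1 of the all-zeros V2.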
19129         bool UsedZeroVector = false;
19130         assert(is_contained(WidenedMask, SM_SentinelZero) &&
19131                "V2's non-undef elements are used?!");
19132         for (int i = 0; i != NewNumElts; ++i)
19133           if (WidenedMask[i] == SM_SentinelZero) {
19134             WidenedMask[i] = i + NewNumElts;
19135             UsedZeroVector = true;
19136           }
19137         // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
19138         // some elements to be undef.
19139         if (UsedZeroVector)
19140           V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
19141       }
19142       V1 = DAG.getBitcast(NewVT, V1);
19143       V2 = DAG.getBitcast(NewVT, V2);
19144       return DAG.getBitcast(
19145           VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
19146     }
19147   }
19148 
19149   SmallVector<SDValue> Ops = {V1, V2};
19150   SmallVector<int> Mask(OrigMask.begin(), OrigMask.end());
19151 
19152   // Canonicalize the shuffle with any horizontal ops inputs.
19153   // NOTE: This may update Ops and Mask.
19154   if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
19155           Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
19156     return DAG.getBitcast(VT, HOp);
19157 
19158   V1 = DAG.getBitcast(VT, Ops[0]);
19159   V2 = DAG.getBitcast(VT, Ops[1]);
19160   assert(NumElements == (int)Mask.size() &&
19161          "canonicalizeShuffleMaskWithHorizOp "
19162          "shouldn't alter the shuffle mask size");
19163 
19164   // Commute the shuffle if it will improve canonicalization.
19165   if (canonicalizeShuffleMaskWithCommute(Mask)) {
19166     ShuffleVectorSDNode::commuteMask(Mask);
19167     std::swap(V1, V2);
19168   }
19169 
19170   // For each vector width, delegate to a specialized lowering routine.
19171   if (VT.is128BitVector())
19172     return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19173 
19174   if (VT.is256BitVector())
19175     return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19176 
19177   if (VT.is512BitVector())
19178     return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19179 
19180   if (Is1BitVector)
19181     return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19182 
19183   llvm_unreachable("Unimplemented!");
19184 }
19185 
19186 /// Try to lower a VSELECT instruction to a vector shuffle.
19187 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
19188                                            const X86Subtarget &Subtarget,
19189                                            SelectionDAG &DAG) {
19190   SDValue Cond = Op.getOperand(0);
19191   SDValue LHS = Op.getOperand(1);
19192   SDValue RHS = Op.getOperand(2);
19193   MVT VT = Op.getSimpleValueType();
19194 
  // Only non-legal VSELECTs reach this lowering; convert those into generic
19196   // shuffles and re-use the shuffle lowering path for blends.
19197   if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
19198     SmallVector<int, 32> Mask;
19199     if (createShuffleMaskFromVSELECT(Mask, Cond))
19200       return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
19201   }
19202 
19203   return SDValue();
19204 }
19205 
19206 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
19207   SDValue Cond = Op.getOperand(0);
19208   SDValue LHS = Op.getOperand(1);
19209   SDValue RHS = Op.getOperand(2);
19210 
19211   // A vselect where all conditions and data are constants can be optimized into
19212   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
19213   if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
19214       ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
19215       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
19216     return SDValue();
19217 
19218   // Try to lower this to a blend-style vector shuffle. This can handle all
19219   // constant condition cases.
19220   if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
19221     return BlendOp;
19222 
  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
19224   // with patterns on the mask registers on AVX-512.
19225   MVT CondVT = Cond.getSimpleValueType();
19226   unsigned CondEltSize = Cond.getScalarValueSizeInBits();
19227   if (CondEltSize == 1)
19228     return Op;
19229 
19230   // Variable blends are only legal from SSE4.1 onward.
19231   if (!Subtarget.hasSSE41())
19232     return SDValue();
19233 
19234   SDLoc dl(Op);
19235   MVT VT = Op.getSimpleValueType();
19236   unsigned EltSize = VT.getScalarSizeInBits();
19237   unsigned NumElts = VT.getVectorNumElements();
19238 
19239   // Expand v32i16/v64i8 without BWI.
19240   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
19241     return SDValue();
19242 
19243   // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
19244   // into an i1 condition so that we can use the mask-based 512-bit blend
19245   // instructions.
19246   if (VT.getSizeInBits() == 512) {
19247     // Build a mask by testing the condition against zero.
19248     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
19249     SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
19250                                 DAG.getConstant(0, dl, CondVT),
19251                                 ISD::SETNE);
19252     // Now return a new VSELECT using the mask.
19253     return DAG.getSelect(dl, VT, Mask, LHS, RHS);
19254   }
19255 
19256   // SEXT/TRUNC cases where the mask doesn't match the destination size.
19257   if (CondEltSize != EltSize) {
19258     // If we don't have a sign splat, rely on the expansion.
19259     if (CondEltSize != DAG.ComputeNumSignBits(Cond))
19260       return SDValue();
19261 
19262     MVT NewCondSVT = MVT::getIntegerVT(EltSize);
19263     MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
19264     Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
19265     return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
19266   }
19267 
19268   // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, but if we need to expand, return
19270   // a null value.
19271   switch (VT.SimpleTy) {
19272   default:
19273     // Most of the vector types have blends past SSE4.1.
19274     return Op;
19275 
19276   case MVT::v32i8:
19277     // The byte blends for AVX vectors were introduced only in AVX2.
19278     if (Subtarget.hasAVX2())
19279       return Op;
19280 
19281     return SDValue();
19282 
19283   case MVT::v8i16:
19284   case MVT::v16i16: {
19285     // Bitcast everything to the vXi8 type and use a vXi8 vselect.
19286     MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
19287     Cond = DAG.getBitcast(CastVT, Cond);
19288     LHS = DAG.getBitcast(CastVT, LHS);
19289     RHS = DAG.getBitcast(CastVT, RHS);
19290     SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
19291     return DAG.getBitcast(VT, Select);
19292   }
19293   }
19294 }
19295 
19296 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
19297   MVT VT = Op.getSimpleValueType();
19298   SDValue Vec = Op.getOperand(0);
19299   SDValue Idx = Op.getOperand(1);
19300   assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
19301   SDLoc dl(Op);
19302 
19303   if (!Vec.getSimpleValueType().is128BitVector())
19304     return SDValue();
19305 
19306   if (VT.getSizeInBits() == 8) {
19307     // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
19308     // we're going to zero extend the register or fold the store.
19309     if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
19310         !X86::mayFoldIntoStore(Op))
19311       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
19312                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19313                                      DAG.getBitcast(MVT::v4i32, Vec), Idx));
19314 
19315     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
19316     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
19317                                   DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19318     return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19319   }
19320 
19321   if (VT == MVT::f32) {
19322     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
19323     // the result back to FR32 register. It's only worth matching if the
19324     // result has a single use which is a store or a bitcast to i32.  And in
19325     // the case of a store, it's not worth it if the index is a constant 0,
19326     // because a MOVSSmr can be used instead, which is smaller and faster.
19327     if (!Op.hasOneUse())
19328       return SDValue();
19329     SDNode *User = *Op.getNode()->use_begin();
19330     if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
19331         (User->getOpcode() != ISD::BITCAST ||
19332          User->getValueType(0) != MVT::i32))
19333       return SDValue();
19334     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19335                                   DAG.getBitcast(MVT::v4i32, Vec), Idx);
19336     return DAG.getBitcast(MVT::f32, Extract);
19337   }
19338 
19339   if (VT == MVT::i32 || VT == MVT::i64)
19340       return Op;
19341 
19342   return SDValue();
19343 }
19344 
19345 /// Extract one bit from mask vector, like v16i1 or v8i1.
19346 /// AVX-512 feature.
19347 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
19348                                         const X86Subtarget &Subtarget) {
19349   SDValue Vec = Op.getOperand(0);
19350   SDLoc dl(Vec);
19351   MVT VecVT = Vec.getSimpleValueType();
19352   SDValue Idx = Op.getOperand(1);
19353   auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19354   MVT EltVT = Op.getSimpleValueType();
19355 
19356   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
19357          "Unexpected vector type in ExtractBitFromMaskVector");
19358 
  // A variable index can't be handled in mask registers, so extend the
  // vector to VR512/VR128.
19361   if (!IdxC) {
19362     unsigned NumElts = VecVT.getVectorNumElements();
    // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
    // than extending to 128/256-bit.
19365     MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19366     MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19367     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
19368     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
19369     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
19370   }
19371 
19372   unsigned IdxVal = IdxC->getZExtValue();
19373   if (IdxVal == 0) // the operation is legal
19374     return Op;
19375 
19376   // Extend to natively supported kshift.
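  // e.g. extracting bit 5 of a v8i1 mask without DQI widens to v16i1, does a
  // KSHIFTR by 5, and then extracts element 0.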
19377   unsigned NumElems = VecVT.getVectorNumElements();
19378   MVT WideVecVT = VecVT;
19379   if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19380     WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19381     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19382                       DAG.getUNDEF(WideVecVT), Vec,
19383                       DAG.getIntPtrConstant(0, dl));
19384   }
19385 
19386   // Use kshiftr instruction to move to the lower element.
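  // For example (roughly, assuming the mask is already in a k-register),
  // extracting bit 5 of a v16i1 mask becomes a kshiftrw by 5 of that register
  // followed by an extract of element 0, which is legal and handled above.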
19387   Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19388                     DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19389 
19390   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19391                      DAG.getIntPtrConstant(0, dl));
19392 }
19393 
19394 SDValue
19395 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
19396                                            SelectionDAG &DAG) const {
19397   SDLoc dl(Op);
19398   SDValue Vec = Op.getOperand(0);
19399   MVT VecVT = Vec.getSimpleValueType();
19400   SDValue Idx = Op.getOperand(1);
19401   auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19402 
19403   if (VecVT.getVectorElementType() == MVT::i1)
19404     return ExtractBitFromMaskVector(Op, DAG, Subtarget);
19405 
19406   if (!IdxC) {
    // It's more profitable to go through memory (1 cycle throughput)
    // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
    // The IACA tool was used to get the performance estimates
    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
19411     //
19412     // example : extractelement <16 x i8> %a, i32 %i
19413     //
19414     // Block Throughput: 3.00 Cycles
19415     // Throughput Bottleneck: Port5
19416     //
19417     // | Num Of |   Ports pressure in cycles  |    |
19418     // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
19419     // ---------------------------------------------
19420     // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
19421     // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
19422     // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
19423     // Total Num Of Uops: 4
19424     //
19425     //
19426     // Block Throughput: 1.00 Cycles
19427     // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
19428     //
19429     // |    |  Ports pressure in cycles   |  |
19430     // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
19431     // ---------------------------------------------------------
19432     // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
19433     // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
19434     // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
19435     // Total Num Of Uops: 4
19436 
19437     return SDValue();
19438   }
19439 
19440   unsigned IdxVal = IdxC->getZExtValue();
19441 
  // If this is a 256-bit or 512-bit vector result, first extract the
  // containing 128-bit subvector and then extract the element from it.
19444   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
19445     // Get the 128-bit vector.
19446     Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
19447     MVT EltVT = VecVT.getVectorElementType();
19448 
19449     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
19450     assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
19451 
19452     // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
19453     // this can be done with a mask.
19454     IdxVal &= ElemsPerChunk - 1;
19455     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19456                        DAG.getIntPtrConstant(IdxVal, dl));
19457   }
19458 
19459   assert(VecVT.is128BitVector() && "Unexpected vector length");
19460 
19461   MVT VT = Op.getSimpleValueType();
19462 
19463   if (VT == MVT::i16) {
19464     // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
19465     // we're going to zero extend the register or fold the store (SSE41 only).
19466     if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
19467         !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
19468       if (Subtarget.hasFP16())
19469         return Op;
19470 
19471       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
19472                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19473                                      DAG.getBitcast(MVT::v4i32, Vec), Idx));
19474     }
19475 
19476     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
19477                                   DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19478     return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19479   }
19480 
19481   if (Subtarget.hasSSE41())
19482     if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
19483       return Res;
19484 
  // TODO: We only extract a single element from v16i8; we can probably afford
  // to be more aggressive here before using the default approach of spilling
  // to the stack.
19488   if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
19489     // Extract either the lowest i32 or any i16, and extract the sub-byte.
19490     int DWordIdx = IdxVal / 4;
19491     if (DWordIdx == 0) {
19492       SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19493                                 DAG.getBitcast(MVT::v4i32, Vec),
19494                                 DAG.getIntPtrConstant(DWordIdx, dl));
19495       int ShiftVal = (IdxVal % 4) * 8;
19496       if (ShiftVal != 0)
19497         Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
19498                           DAG.getConstant(ShiftVal, dl, MVT::i8));
19499       return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19500     }
19501 
19502     int WordIdx = IdxVal / 2;
19503     SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
19504                               DAG.getBitcast(MVT::v8i16, Vec),
19505                               DAG.getIntPtrConstant(WordIdx, dl));
19506     int ShiftVal = (IdxVal % 2) * 8;
19507     if (ShiftVal != 0)
19508       Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
19509                         DAG.getConstant(ShiftVal, dl, MVT::i8));
19510     return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19511   }
19512 
19513   if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
19514     if (IdxVal == 0)
19515       return Op;
19516 
19517     // Shuffle the element to the lowest element, then movss or movsh.
19518     SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
19519     Mask[0] = static_cast<int>(IdxVal);
19520     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19521     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19522                        DAG.getIntPtrConstant(0, dl));
19523   }
19524 
19525   if (VT.getSizeInBits() == 64) {
19526     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
19527     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
19528     //        to match extract_elt for f64.
19529     if (IdxVal == 0)
19530       return Op;
19531 
19532     // UNPCKHPD the element to the lowest double word, then movsd.
19533     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
19534     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
19535     int Mask[2] = { 1, -1 };
19536     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19537     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19538                        DAG.getIntPtrConstant(0, dl));
19539   }
19540 
19541   return SDValue();
19542 }
19543 
19544 /// Insert one bit to mask vector, like v16i1 or v8i1.
19545 /// AVX-512 feature.
19546 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
19547                                      const X86Subtarget &Subtarget) {
19548   SDLoc dl(Op);
19549   SDValue Vec = Op.getOperand(0);
19550   SDValue Elt = Op.getOperand(1);
19551   SDValue Idx = Op.getOperand(2);
19552   MVT VecVT = Vec.getSimpleValueType();
19553 
19554   if (!isa<ConstantSDNode>(Idx)) {
    // Non-constant index: extend the source and destination, insert the
    // element, and then truncate the result.
19557     unsigned NumElts = VecVT.getVectorNumElements();
19558     MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19559     MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19560     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
19561       DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
19562       DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
19563     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
19564   }
19565 
  // Copy the element into a k-register as a v1i1 vector and insert it as a
  // subvector at the constant index.
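  // i.e. (insert_elt Vec, Elt, IdxC)
  //        --> (insert_subvector Vec, (scalar_to_vector v1i1 Elt), IdxC)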
19567   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
19568   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
19569 }
19570 
19571 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
19572                                                   SelectionDAG &DAG) const {
19573   MVT VT = Op.getSimpleValueType();
19574   MVT EltVT = VT.getVectorElementType();
19575   unsigned NumElts = VT.getVectorNumElements();
19576   unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
19577 
19578   if (EltVT == MVT::i1)
19579     return InsertBitToMaskVector(Op, DAG, Subtarget);
19580 
19581   SDLoc dl(Op);
19582   SDValue N0 = Op.getOperand(0);
19583   SDValue N1 = Op.getOperand(1);
19584   SDValue N2 = Op.getOperand(2);
19585   auto *N2C = dyn_cast<ConstantSDNode>(N2);
19586 
19587   if (!N2C) {
    // With variable insertion indices we're usually better off spilling to
    // the stack, but AVX512 can use a variable compare+select by comparing
    // against all possible vector indices, and FP insertion has less
    // gpr->simd traffic.
19591     if (!(Subtarget.hasBWI() ||
19592           (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
19593           (Subtarget.hasSSE41() && VT.isFloatingPoint())))
19594       return SDValue();
19595 
19596     MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
19597     MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
19598     if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
19599       return SDValue();
19600 
19601     SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
19602     SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
19603     SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
19604 
19605     SmallVector<SDValue, 16> RawIndices;
19606     for (unsigned I = 0; I != NumElts; ++I)
19607       RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
19608     SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
19609 
19610     // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
19611     return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
19612                            ISD::CondCode::SETEQ);
19613   }
19614 
19615   if (N2C->getAPIntValue().uge(NumElts))
19616     return SDValue();
19617   uint64_t IdxVal = N2C->getZExtValue();
19618 
19619   bool IsZeroElt = X86::isZeroNode(N1);
19620   bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
19621 
19622   if (IsZeroElt || IsAllOnesElt) {
19623     // Lower insertion of i8 -1 as an 'OR' blend.
19624     // We don't deal with i8 0 since it appears to be handled elsewhere.
19625     if (IsAllOnesElt && EltSizeInBits == 8 && !Subtarget.hasSSE41()) {
19626       SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
19627       SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
19628       SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
19629       CstVectorElts[IdxVal] = OnesCst;
19630       SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
19631       return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
19632     }
19633     // See if we can do this more efficiently with a blend shuffle with a
19634     // rematerializable vector.
19635     if (Subtarget.hasSSE41() &&
19636         (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
19637       SmallVector<int, 8> BlendMask;
19638       for (unsigned i = 0; i != NumElts; ++i)
19639         BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19640       SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19641                                     : getOnesVector(VT, DAG, dl);
19642       return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19643     }
19644   }
19645 
19646   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19647   // into that, and then insert the subvector back into the result.
19648   if (VT.is256BitVector() || VT.is512BitVector()) {
19649     // With a 256-bit vector, we can insert into the zero element efficiently
19650     // using a blend if we have AVX or AVX2 and the right data type.
19651     if (VT.is256BitVector() && IdxVal == 0) {
19652       // TODO: It is worthwhile to cast integer to floating point and back
19653       // and incur a domain crossing penalty if that's what we'll end up
19654       // doing anyway after extracting to a 128-bit vector.
19655       if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19656           (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
19657         SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19658         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19659                            DAG.getTargetConstant(1, dl, MVT::i8));
19660       }
19661     }
19662 
19663     unsigned NumEltsIn128 = 128 / EltSizeInBits;
19664     assert(isPowerOf2_32(NumEltsIn128) &&
19665            "Vectors will always have power-of-two number of elements.");
19666 
19667     // If we are not inserting into the low 128-bit vector chunk,
19668     // then prefer the broadcast+blend sequence.
19669     // FIXME: relax the profitability check iff all N1 uses are insertions.
19670     if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
19671         ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19672          (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19673           X86::mayFoldLoad(N1, Subtarget)))) {
19674       SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19675       SmallVector<int, 8> BlendMask;
19676       for (unsigned i = 0; i != NumElts; ++i)
19677         BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19678       return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19679     }
19680 
19681     // Get the desired 128-bit vector chunk.
19682     SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19683 
19684     // Insert the element into the desired chunk.
19685     // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19686     unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19687 
19688     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19689                     DAG.getIntPtrConstant(IdxIn128, dl));
19690 
19691     // Insert the changed part back into the bigger vector
19692     return insert128BitVector(N0, V, IdxVal, DAG, dl);
19693   }
19694   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19695 
19696   // This will be just movw/movd/movq/movsh/movss/movsd.
19697   if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19698     if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19699         EltVT == MVT::f16 || EltVT == MVT::i64) {
19700       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19701       return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19702     }
19703 
19704     // We can't directly insert an i8 or i16 into a vector, so zero extend
19705     // it to i32 first.
19706     if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19707       N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19708       MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19709       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19710       N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19711       return DAG.getBitcast(VT, N1);
19712     }
19713   }
19714 
  // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
  // argument. SSE41 is required for pinsrb.
19717   if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19718     unsigned Opc;
19719     if (VT == MVT::v8i16) {
19720       assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19721       Opc = X86ISD::PINSRW;
19722     } else {
19723       assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19724       assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19725       Opc = X86ISD::PINSRB;
19726     }
19727 
19728     assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19729     N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19730     N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19731     return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19732   }
19733 
19734   if (Subtarget.hasSSE41()) {
19735     if (EltVT == MVT::f32) {
19736       // Bits [7:6] of the constant are the source select. This will always be
19737       //   zero here. The DAG Combiner may combine an extract_elt index into
19738       //   these bits. For example (insert (extract, 3), 2) could be matched by
19739       //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19740       // Bits [5:4] of the constant are the destination select. This is the
19741       //   value of the incoming immediate.
19742       // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19743       //   combine either bitwise AND or insert of float 0.0 to set these bits.
19744 
19745       bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19746       if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19747         // If this is an insertion of 32-bits into the low 32-bits of
19748         // a vector, we prefer to generate a blend with immediate rather
19749         // than an insertps. Blends are simpler operations in hardware and so
19750         // will always have equal or better performance than insertps.
19751         // But if optimizing for size and there's a load folding opportunity,
19752         // generate insertps because blendps does not have a 32-bit memory
19753         // operand form.
19754         N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19755         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19756                            DAG.getTargetConstant(1, dl, MVT::i8));
19757       }
      // Create this as a scalar-to-vector.
19759       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19760       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19761                          DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19762     }
19763 
19764     // PINSR* works with constant index.
19765     if (EltVT == MVT::i32 || EltVT == MVT::i64)
19766       return Op;
19767   }
19768 
19769   return SDValue();
19770 }
19771 
19772 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19773                                      SelectionDAG &DAG) {
19774   SDLoc dl(Op);
19775   MVT OpVT = Op.getSimpleValueType();
19776 
  // It's always cheaper to replace a xor+movd with xorps, and it simplifies
  // further combines.
19779   if (X86::isZeroNode(Op.getOperand(0)))
19780     return getZeroVector(OpVT, Subtarget, DAG, dl);
19781 
  // If this is a wider-than-128-bit vector result, first insert into a 128-bit
  // vector and then insert that into the full-width vector.
19784   if (!OpVT.is128BitVector()) {
19785     // Insert into a 128-bit vector.
19786     unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19787     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19788                                  OpVT.getVectorNumElements() / SizeFactor);
19789 
19790     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19791 
19792     // Insert the 128-bit vector.
19793     return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19794   }
19795   assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19796          "Expected an SSE type!");
19797 
  // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
  // tblgen.
19800   if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19801     return Op;
19802 
19803   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19804   return DAG.getBitcast(
19805       OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19806 }
19807 
19808 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
19809 // simple superregister reference or explicit instructions to insert
19810 // the upper bits of a vector.
19811 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19812                                      SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Only vXi1 insert_subvectors need custom lowering");
19814 
19815   return insert1BitVector(Op, DAG, Subtarget);
19816 }
19817 
19818 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19819                                       SelectionDAG &DAG) {
19820   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19821          "Only vXi1 extract_subvectors need custom lowering");
19822 
19823   SDLoc dl(Op);
19824   SDValue Vec = Op.getOperand(0);
19825   uint64_t IdxVal = Op.getConstantOperandVal(1);
19826 
19827   if (IdxVal == 0) // the operation is legal
19828     return Op;
19829 
19830   MVT VecVT = Vec.getSimpleValueType();
19831   unsigned NumElems = VecVT.getVectorNumElements();
19832 
19833   // Extend to natively supported kshift.
19834   MVT WideVecVT = VecVT;
19835   if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19836     WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19837     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19838                       DAG.getUNDEF(WideVecVT), Vec,
19839                       DAG.getIntPtrConstant(0, dl));
19840   }
19841 
19842   // Shift to the LSB.
19843   Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19844                     DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19845 
19846   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19847                      DAG.getIntPtrConstant(0, dl));
19848 }
19849 
19850 // Returns the appropriate wrapper opcode for a global reference.
19851 unsigned X86TargetLowering::getGlobalWrapperKind(
19852     const GlobalValue *GV, const unsigned char OpFlags) const {
19853   // References to absolute symbols are never PC-relative.
19854   if (GV && GV->isAbsoluteSymbolRef())
19855     return X86ISD::Wrapper;
19856 
19857   CodeModel::Model M = getTargetMachine().getCodeModel();
19858   if (Subtarget.isPICStyleRIPRel() &&
19859       (M == CodeModel::Small || M == CodeModel::Kernel))
19860     return X86ISD::WrapperRIP;
19861 
19862   // GOTPCREL references must always use RIP.
19863   if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19864     return X86ISD::WrapperRIP;
19865 
19866   return X86ISD::Wrapper;
19867 }
19868 
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOV32ri.
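// For example (a sketch), a ConstantPool node becomes
// (Wrapper (TargetConstantPool ...)), which selects to something like
// "movl $.LCPI0_0, %eax" in non-PIC 32-bit code, or a RIP-relative lea
// under WrapperRIP.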
19875 SDValue
19876 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19877   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19878 
19879   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19880   // global base reg.
19881   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19882 
19883   auto PtrVT = getPointerTy(DAG.getDataLayout());
19884   SDValue Result = DAG.getTargetConstantPool(
19885       CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19886   SDLoc DL(CP);
19887   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19888   // With PIC, the address is actually $g + Offset.
19889   if (OpFlag) {
19890     Result =
19891         DAG.getNode(ISD::ADD, DL, PtrVT,
19892                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19893   }
19894 
19895   return Result;
19896 }
19897 
19898 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19899   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19900 
19901   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19902   // global base reg.
19903   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19904 
19905   auto PtrVT = getPointerTy(DAG.getDataLayout());
19906   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19907   SDLoc DL(JT);
19908   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19909 
19910   // With PIC, the address is actually $g + Offset.
19911   if (OpFlag)
19912     Result =
19913         DAG.getNode(ISD::ADD, DL, PtrVT,
19914                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19915 
19916   return Result;
19917 }
19918 
19919 SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19920                                                SelectionDAG &DAG) const {
19921   return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19922 }
19923 
19924 SDValue
19925 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  // Create the TargetBlockAddress node.
19927   unsigned char OpFlags =
19928     Subtarget.classifyBlockAddressReference();
19929   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19930   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19931   SDLoc dl(Op);
19932   auto PtrVT = getPointerTy(DAG.getDataLayout());
19933   SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19934   Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
19935 
19936   // With PIC, the address is actually $g + Offset.
19937   if (isGlobalRelativeToPICBase(OpFlags)) {
19938     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19939                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19940   }
19941 
19942   return Result;
19943 }
19944 
19945 /// Creates target global address or external symbol nodes for calls or
19946 /// other uses.
19947 SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19948                                                  bool ForCall) const {
19949   // Unpack the global address or external symbol.
19950   const SDLoc &dl = SDLoc(Op);
19951   const GlobalValue *GV = nullptr;
19952   int64_t Offset = 0;
19953   const char *ExternalSym = nullptr;
19954   if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19955     GV = G->getGlobal();
19956     Offset = G->getOffset();
19957   } else {
19958     const auto *ES = cast<ExternalSymbolSDNode>(Op);
19959     ExternalSym = ES->getSymbol();
19960   }
19961 
19962   // Calculate some flags for address lowering.
19963   const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19964   unsigned char OpFlags;
19965   if (ForCall)
19966     OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19967   else
19968     OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19969   bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19970   bool NeedsLoad = isGlobalStubReference(OpFlags);
19971 
19972   CodeModel::Model M = DAG.getTarget().getCodeModel();
19973   auto PtrVT = getPointerTy(DAG.getDataLayout());
19974   SDValue Result;
19975 
19976   if (GV) {
19977     // Create a target global address if this is a global. If possible, fold the
19978     // offset into the global address reference. Otherwise, ADD it on later.
19979     // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19980     // allowed because if the address of foo is 0, the ELF R_X86_64_32
19981     // relocation will compute to a negative value, which is invalid.
19982     int64_t GlobalOffset = 0;
19983     if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19984         X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19985       std::swap(GlobalOffset, Offset);
19986     }
19987     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19988   } else {
19989     // If this is not a global address, this must be an external symbol.
19990     Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19991   }
19992 
19993   // If this is a direct call, avoid the wrapper if we don't need to do any
19994   // loads or adds. This allows SDAG ISel to match direct calls.
19995   if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19996     return Result;
19997 
19998   Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19999 
20000   // With PIC, the address is actually $g + Offset.
20001   if (HasPICReg) {
20002     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20003                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20004   }
20005 
20006   // For globals that require a load from a stub to get the address, emit the
20007   // load.
20008   if (NeedsLoad)
20009     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
20010                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20011 
20012   // If there was a non-zero offset that we didn't fold, create an explicit
20013   // addition for it.
20014   if (Offset != 0)
20015     Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
20016                          DAG.getConstant(Offset, dl, PtrVT));
20017 
20018   return Result;
20019 }
20020 
20021 SDValue
20022 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
20023   return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20024 }
20025 
20026 static SDValue
20027 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
20028            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
20029            unsigned char OperandFlags, bool LocalDynamic = false) {
20030   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20031   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20032   SDLoc dl(GA);
20033   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20034                                            GA->getValueType(0),
20035                                            GA->getOffset(),
20036                                            OperandFlags);
20037 
20038   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
20039                                            : X86ISD::TLSADDR;
20040 
20041   if (InFlag) {
20042     SDValue Ops[] = { Chain,  TGA, *InFlag };
20043     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20044   } else {
20045     SDValue Ops[]  = { Chain, TGA };
20046     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20047   }
20048 
  // TLSADDR will be codegen'ed as a call. Inform MFI that the function has
  // calls.
20050   MFI.setAdjustsStack(true);
20051   MFI.setHasCalls(true);
20052 
20053   SDValue Flag = Chain.getValue(1);
20054   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
20055 }
20056 
20057 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
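// (Roughly: a leal x@tlsgd(,%ebx,1),%eax followed by a
//  calll ___tls_get_addr@PLT.)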
20058 static SDValue
20059 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20060                                 const EVT PtrVT) {
20061   SDValue InFlag;
  SDLoc dl(GA); // TODO: The function entry point might be a better SDLoc.
20063   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
20064                                    DAG.getNode(X86ISD::GlobalBaseReg,
20065                                                SDLoc(), PtrVT), InFlag);
20066   InFlag = Chain.getValue(1);
20067 
20068   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
20069 }
20070 
20071 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
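// (Roughly: a padded leaq x@tlsgd(%rip),%rdi followed by a
//  call __tls_get_addr@PLT.)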
20072 static SDValue
20073 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20074                                 const EVT PtrVT) {
20075   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
20076                     X86::RAX, X86II::MO_TLSGD);
20077 }
20078 
20079 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
20080 static SDValue
20081 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20082                                  const EVT PtrVT) {
20083   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
20084                     X86::EAX, X86II::MO_TLSGD);
20085 }
20086 
20087 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
20088                                            SelectionDAG &DAG, const EVT PtrVT,
20089                                            bool Is64Bit, bool Is64BitLP64) {
20090   SDLoc dl(GA);
20091 
20092   // Get the start address of the TLS block for this module.
20093   X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
20094       .getInfo<X86MachineFunctionInfo>();
20095   MFI->incNumLocalDynamicTLSAccesses();
20096 
20097   SDValue Base;
20098   if (Is64Bit) {
20099     unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
20100     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
20101                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
20102   } else {
20103     SDValue InFlag;
20104     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
20105         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
20106     InFlag = Chain.getValue(1);
20107     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
20108                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
20109   }
20110 
20111   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
20112   // of Base.
20113 
20114   // Build x@dtpoff.
20115   unsigned char OperandFlags = X86II::MO_DTPOFF;
20116   unsigned WrapperKind = X86ISD::Wrapper;
20117   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20118                                            GA->getValueType(0),
20119                                            GA->getOffset(), OperandFlags);
20120   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20121 
20122   // Add x@dtpoff with the base.
20123   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
20124 }
20125 
20126 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
20127 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20128                                    const EVT PtrVT, TLSModel::Model model,
20129                                    bool is64Bit, bool isPIC) {
20130   SDLoc dl(GA);
20131 
20132   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
20133   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
20134                                                          is64Bit ? 257 : 256));
20135 
20136   SDValue ThreadPointer =
20137       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
20138                   MachinePointerInfo(Ptr));
20139 
20140   unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initial exec.
20143   unsigned WrapperKind = X86ISD::Wrapper;
20144   if (model == TLSModel::LocalExec) {
20145     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
20146   } else if (model == TLSModel::InitialExec) {
20147     if (is64Bit) {
20148       OperandFlags = X86II::MO_GOTTPOFF;
20149       WrapperKind = X86ISD::WrapperRIP;
20150     } else {
20151       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
20152     }
20153   } else {
20154     llvm_unreachable("Unexpected model");
20155   }
20156 
20157   // emit "addl x@ntpoff,%eax" (local exec)
20158   // or "addl x@indntpoff,%eax" (initial exec)
  // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
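  // or (roughly) "addq x@gottpoff(%rip),%rax" (initial exec, 64-bit)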
20160   SDValue TGA =
20161       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
20162                                  GA->getOffset(), OperandFlags);
20163   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20164 
20165   if (model == TLSModel::InitialExec) {
20166     if (isPIC && !is64Bit) {
20167       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
20168                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20169                            Offset);
20170     }
20171 
20172     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
20173                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20174   }
20175 
20176   // The address of the thread local variable is the add of the thread
20177   // pointer with the offset of the variable.
20178   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
20179 }
20180 
20181 SDValue
20182 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
20183 
20184   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
20185 
20186   if (DAG.getTarget().useEmulatedTLS())
20187     return LowerToTLSEmulatedModel(GA, DAG);
20188 
20189   const GlobalValue *GV = GA->getGlobal();
20190   auto PtrVT = getPointerTy(DAG.getDataLayout());
20191   bool PositionIndependent = isPositionIndependent();
20192 
20193   if (Subtarget.isTargetELF()) {
20194     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
20195     switch (model) {
20196       case TLSModel::GeneralDynamic:
20197         if (Subtarget.is64Bit()) {
20198           if (Subtarget.isTarget64BitLP64())
20199             return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
20200           return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
20201         }
20202         return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
20203       case TLSModel::LocalDynamic:
20204         return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
20205                                            Subtarget.isTarget64BitLP64());
20206       case TLSModel::InitialExec:
20207       case TLSModel::LocalExec:
20208         return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
20209                                    PositionIndependent);
20210     }
20211     llvm_unreachable("Unknown TLS model.");
20212   }
20213 
20214   if (Subtarget.isTargetDarwin()) {
20215     // Darwin only has one model of TLS.  Lower to that.
20216     unsigned char OpFlag = 0;
20217     unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
20218                            X86ISD::WrapperRIP : X86ISD::Wrapper;
20219 
20220     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20221     // global base reg.
20222     bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
20223     if (PIC32)
20224       OpFlag = X86II::MO_TLVP_PIC_BASE;
20225     else
20226       OpFlag = X86II::MO_TLVP;
20227     SDLoc DL(Op);
20228     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
20229                                                 GA->getValueType(0),
20230                                                 GA->getOffset(), OpFlag);
20231     SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
20232 
20233     // With PIC32, the address is actually $g + Offset.
20234     if (PIC32)
20235       Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
20236                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20237                            Offset);
20238 
    // Lowering the machine ISD node will make sure everything is in the right
    // location.
20241     SDValue Chain = DAG.getEntryNode();
20242     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20243     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
20244     SDValue Args[] = { Chain, Offset };
20245     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
20246     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
20247                                DAG.getIntPtrConstant(0, DL, true),
20248                                Chain.getValue(1), DL);
20249 
    // TLSCALL will be codegen'ed as a call. Inform MFI that the function has
    // calls.
20251     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20252     MFI.setAdjustsStack(true);
20253 
20254     // And our return value (tls address) is in the standard call return value
20255     // location.
20256     unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
20257     return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
20258   }
20259 
20260   if (Subtarget.isOSWindows()) {
    // Just use the implicit TLS architecture.
    // We need to generate something similar to:
20263     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
20264     //                                  ; from TEB
    //   mov     ecx, dword [rel _tls_index] ; Load index (from C runtime)
20266     //   mov     rcx, qword [rdx+rcx*8]
20267     //   mov     eax, .tls$:tlsvar
20268     //   [rax+rcx] contains the address
20269     // Windows 64bit: gs:0x58
20270     // Windows 32bit: fs:__tls_array
20271 
20272     SDLoc dl(GA);
20273     SDValue Chain = DAG.getEntryNode();
20274 
20275     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
20276     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
20277     // use its literal value of 0x2C.
20278     Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
20279                                         ? Type::getInt8PtrTy(*DAG.getContext(),
20280                                                              256)
20281                                         : Type::getInt32PtrTy(*DAG.getContext(),
20282                                                               257));
20283 
20284     SDValue TlsArray = Subtarget.is64Bit()
20285                            ? DAG.getIntPtrConstant(0x58, dl)
20286                            : (Subtarget.isTargetWindowsGNU()
20287                                   ? DAG.getIntPtrConstant(0x2C, dl)
20288                                   : DAG.getExternalSymbol("_tls_array", PtrVT));
20289 
20290     SDValue ThreadPointer =
20291         DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
20292 
20293     SDValue res;
20294     if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
20295       res = ThreadPointer;
20296     } else {
20297       // Load the _tls_index variable
20298       SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
20299       if (Subtarget.is64Bit())
20300         IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
20301                              MachinePointerInfo(), MVT::i32);
20302       else
20303         IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
20304 
20305       const DataLayout &DL = DAG.getDataLayout();
20306       SDValue Scale =
20307           DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
20308       IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
20309 
20310       res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
20311     }
20312 
20313     res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
20314 
    // Get the offset of the start of the .tls section.
20316     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20317                                              GA->getValueType(0),
20318                                              GA->getOffset(), X86II::MO_SECREL);
20319     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
20320 
20321     // The address of the thread local variable is the add of the thread
20322     // pointer with the offset of the variable.
20323     return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
20324   }
20325 
20326   llvm_unreachable("TLS not implemented for this target.");
20327 }
20328 
20329 /// Lower SRA_PARTS and friends, which return two i32 values
20330 /// and take a 2 x i32 value to shift plus a shift amount.
20331 /// TODO: Can this be moved to general expansion code?
20332 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
20333   SDValue Lo, Hi;
20334   DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
20335   return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
20336 }
20337 
20338 // Try to use a packed vector operation to handle i64 on 32-bit targets when
20339 // AVX512DQ is enabled.
20340 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
20341                                         const X86Subtarget &Subtarget) {
20342   assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20343           Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20344           Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20345           Op.getOpcode() == ISD::UINT_TO_FP) &&
20346          "Unexpected opcode!");
20347   bool IsStrict = Op->isStrictFPOpcode();
20348   unsigned OpNo = IsStrict ? 1 : 0;
20349   SDValue Src = Op.getOperand(OpNo);
20350   MVT SrcVT = Src.getSimpleValueType();
20351   MVT VT = Op.getSimpleValueType();
20352 
  if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
      (VT != MVT::f32 && VT != MVT::f64))
20355     return SDValue();
20356 
20357   // Pack the i64 into a vector, do the operation and extract.
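  // For example (a sketch), with DQI+VLX an i64 -> f64 conversion becomes
  // roughly:
  //   extractelt (v4f64 (sint_to_fp (v4i64 (scalar_to_vector Src)))), 0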
20358 
  // Use a 256-bit input (512-bit without VLX) so that the f32 result is a
  // full 128-bit vector.
20360   unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
20361   MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
20362   MVT VecVT = MVT::getVectorVT(VT, NumElts);
20363 
20364   SDLoc dl(Op);
20365   SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
20366   if (IsStrict) {
20367     SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
20368                                  {Op.getOperand(0), InVec});
20369     SDValue Chain = CvtVec.getValue(1);
20370     SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20371                                 DAG.getIntPtrConstant(0, dl));
20372     return DAG.getMergeValues({Value, Chain}, dl);
20373   }
20374 
20375   SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
20376 
20377   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20378                      DAG.getIntPtrConstant(0, dl));
20379 }
20380 
20381 // Try to use a packed vector operation to handle i64 on 32-bit targets.
20382 static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
20383                                  const X86Subtarget &Subtarget) {
20384   assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20385           Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20386           Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20387           Op.getOpcode() == ISD::UINT_TO_FP) &&
20388          "Unexpected opcode!");
20389   bool IsStrict = Op->isStrictFPOpcode();
20390   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20391   MVT SrcVT = Src.getSimpleValueType();
20392   MVT VT = Op.getSimpleValueType();
20393 
20394   if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
20395     return SDValue();
20396 
20397   // Pack the i64 into a vector, do the operation and extract.
20398 
20399   assert(Subtarget.hasFP16() && "Expected FP16");
20400 
20401   SDLoc dl(Op);
20402   SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
20403   if (IsStrict) {
20404     SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
20405                                  {Op.getOperand(0), InVec});
20406     SDValue Chain = CvtVec.getValue(1);
20407     SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20408                                 DAG.getIntPtrConstant(0, dl));
20409     return DAG.getMergeValues({Value, Chain}, dl);
20410   }
20411 
20412   SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
20413 
20414   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20415                      DAG.getIntPtrConstant(0, dl));
20416 }
20417 
20418 static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
20419                           const X86Subtarget &Subtarget) {
20420   switch (Opcode) {
20421     case ISD::SINT_TO_FP:
20422       // TODO: Handle wider types with AVX/AVX512.
20423       if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
20424         return false;
20425       // CVTDQ2PS or (V)CVTDQ2PD
20426       return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
20427 
20428     case ISD::UINT_TO_FP:
20429       // TODO: Handle wider types and i64 elements.
20430       if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
20431         return false;
20432       // VCVTUDQ2PS or VCVTUDQ2PD
20433       return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
20434 
20435     default:
20436       return false;
20437   }
20438 }
20439 
20440 /// Given a scalar cast operation that is extracted from a vector, try to
20441 /// vectorize the cast op followed by extraction. This will avoid an expensive
20442 /// round-trip between XMM and GPR.
20443 static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
20444                                       const X86Subtarget &Subtarget) {
20445   // TODO: This could be enhanced to handle smaller integer types by peeking
20446   // through an extend.
20447   SDValue Extract = Cast.getOperand(0);
20448   MVT DestVT = Cast.getSimpleValueType();
20449   if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20450       !isa<ConstantSDNode>(Extract.getOperand(1)))
20451     return SDValue();
20452 
20453   // See if we have a 128-bit vector cast op for this type of cast.
20454   SDValue VecOp = Extract.getOperand(0);
20455   MVT FromVT = VecOp.getSimpleValueType();
20456   unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
20457   MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
20458   MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
20459   if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
20460     return SDValue();
20461 
20462   // If we are extracting from a non-zero element, first shuffle the source
20463   // vector to allow extracting from element zero.
20464   SDLoc DL(Cast);
20465   if (!isNullConstant(Extract.getOperand(1))) {
20466     SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
20467     Mask[0] = Extract.getConstantOperandVal(1);
20468     VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
20469   }
20470   // If the source vector is wider than 128-bits, extract the low part. Do not
20471   // create an unnecessarily wide vector cast op.
20472   if (FromVT != Vec128VT)
20473     VecOp = extract128BitVector(VecOp, 0, DAG, DL);
20474 
20475   // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
20476   // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
20477   SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
20478   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
20479                      DAG.getIntPtrConstant(0, DL));
20480 }
20481 
20482 /// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
20483 /// try to vectorize the cast ops. This will avoid an expensive round-trip
20484 /// between XMM and GPR.
20485 static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
20486                                 const X86Subtarget &Subtarget) {
20487   // TODO: Allow FP_TO_UINT.
20488   SDValue CastToInt = CastToFP.getOperand(0);
20489   MVT VT = CastToFP.getSimpleValueType();
20490   if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
20491     return SDValue();
20492 
20493   MVT IntVT = CastToInt.getSimpleValueType();
20494   SDValue X = CastToInt.getOperand(0);
20495   MVT SrcVT = X.getSimpleValueType();
20496   if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
20497     return SDValue();
20498 
20499   // See if we have 128-bit vector cast instructions for this type of cast.
20500   // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
20501   if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
20502       IntVT != MVT::i32)
20503     return SDValue();
20504 
20505   unsigned SrcSize = SrcVT.getSizeInBits();
20506   unsigned IntSize = IntVT.getSizeInBits();
20507   unsigned VTSize = VT.getSizeInBits();
20508   MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
20509   MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
20510   MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
20511 
20512   // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
20513   unsigned ToIntOpcode =
20514       SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
20515   unsigned ToFPOpcode =
20516       IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
20517 
20518   // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
20519   //
20520   // We are not defining the high elements (for example, zero them) because
20521   // that could nullify any performance advantage that we hoped to gain from
20522   // this vector op hack. We do not expect any adverse effects (like denorm
20523   // penalties) with cast ops.
20524   SDLoc DL(CastToFP);
20525   SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
20526   SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
20527   SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
20528   SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
20529   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
20530 }
20531 
20532 static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
20533                                     const X86Subtarget &Subtarget) {
20534   SDLoc DL(Op);
20535   bool IsStrict = Op->isStrictFPOpcode();
20536   MVT VT = Op->getSimpleValueType(0);
20537   SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
20538 
20539   if (Subtarget.hasDQI()) {
20540     assert(!Subtarget.hasVLX() && "Unexpected features");
20541 
20542     assert((Src.getSimpleValueType() == MVT::v2i64 ||
20543             Src.getSimpleValueType() == MVT::v4i64) &&
20544            "Unsupported custom type");
20545 
    // With AVX512DQ but not VLX, we need to widen to get a 512-bit result
    // type.
20547     assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
20548            "Unexpected VT!");
20549     MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20550 
20551     // Need to concat with zero vector for strict fp to avoid spurious
20552     // exceptions.
20553     SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
20554                            : DAG.getUNDEF(MVT::v8i64);
20555     Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
20556                       DAG.getIntPtrConstant(0, DL));
20557     SDValue Res, Chain;
20558     if (IsStrict) {
20559       Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
20560                         {Op->getOperand(0), Src});
20561       Chain = Res.getValue(1);
20562     } else {
20563       Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
20564     }
20565 
20566     Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20567                       DAG.getIntPtrConstant(0, DL));
20568 
20569     if (IsStrict)
20570       return DAG.getMergeValues({Res, Chain}, DL);
20571     return Res;
20572   }
20573 
20574   bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
20575                   Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
20576   if (VT != MVT::v4f32 || IsSigned)
20577     return SDValue();
20578 
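  // Lower v4i64 uint_to_fp to v4f32 manually (a sketch of the idea): for
  // inputs with the sign bit set, halve the value as (Src >> 1) | (Src & 1)
  // (OR-ing in the low bit to avoid double rounding), convert each element
  // with a scalar sitofp, and double the result with an fadd; otherwise
  // convert the value directly. Select between the two based on the sign of
  // each input element.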
20579   SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20580   SDValue One  = DAG.getConstant(1, DL, MVT::v4i64);
20581   SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20582                              DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20583                              DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20584   SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20585   SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20586   SmallVector<SDValue, 4> SignCvts(4);
20587   SmallVector<SDValue, 4> Chains(4);
20588   for (int i = 0; i != 4; ++i) {
20589     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20590                               DAG.getIntPtrConstant(i, DL));
20591     if (IsStrict) {
20592       SignCvts[i] =
20593           DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20594                       {Op.getOperand(0), Elt});
20595       Chains[i] = SignCvts[i].getValue(1);
20596     } else {
20597       SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20598     }
20599   }
20600   SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20601 
20602   SDValue Slow, Chain;
20603   if (IsStrict) {
20604     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20605     Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20606                        {Chain, SignCvt, SignCvt});
20607     Chain = Slow.getValue(1);
20608   } else {
20609     Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20610   }
20611 
20612   IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20613   SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20614 
20615   if (IsStrict)
20616     return DAG.getMergeValues({Cvt, Chain}, DL);
20617 
20618   return Cvt;
20619 }
20620 
20621 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20622                                            SelectionDAG &DAG) const {
20623   bool IsStrict = Op->isStrictFPOpcode();
20624   unsigned OpNo = IsStrict ? 1 : 0;
20625   SDValue Src = Op.getOperand(OpNo);
20626   SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20627   MVT SrcVT = Src.getSimpleValueType();
20628   MVT VT = Op.getSimpleValueType();
20629   SDLoc dl(Op);
20630 
20631   if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20632     return LowerWin64_INT128_TO_FP(Op, DAG);
20633 
20634   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20635     return Extract;
20636 
20637   if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
20638     return R;
20639 
20640   if (SrcVT.isVector()) {
20641     if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
      // Note: Since v2f64 is a legal type, we don't need to zero-extend the
      // source for strict FP.
20644       if (IsStrict)
20645         return DAG.getNode(
20646             X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20647             {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20648                                 DAG.getUNDEF(SrcVT))});
20649       return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20650                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20651                                      DAG.getUNDEF(SrcVT)));
20652     }
20653     if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20654       return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20655 
20656     return SDValue();
20657   }
20658 
20659   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20660          "Unknown SINT_TO_FP to lower!");
20661 
20662   bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20663 
20664   // These are really Legal; return the operand so the caller accepts it as
20665   // Legal.
20666   if (SrcVT == MVT::i32 && UseSSEReg)
20667     return Op;
20668   if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20669     return Op;
20670 
20671   if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20672     return V;
20673   if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
20674     return V;
20675 
  // SSE doesn't have an i16 conversion, so we need to promote.
20677   if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20678     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20679     if (IsStrict)
20680       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20681                          {Chain, Ext});
20682 
20683     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20684   }
20685 
20686   if (VT == MVT::f128)
20687     return SDValue();
20688 
20689   SDValue ValueToStore = Src;
20690   if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20691     // Bitcasting to f64 here allows us to do a single 64-bit store from
20692     // an SSE register, avoiding the store forwarding penalty that would come
20693     // with two 32-bit stores.
20694     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20695 
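  // Otherwise spill the integer to a stack slot and convert it with x87 FILD
  // (see BuildFILD below).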
20696   unsigned Size = SrcVT.getStoreSize();
20697   Align Alignment(Size);
20698   MachineFunction &MF = DAG.getMachineFunction();
20699   auto PtrVT = getPointerTy(MF.getDataLayout());
20700   int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20701   MachinePointerInfo MPI =
20702       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20703   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20704   Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20705   std::pair<SDValue, SDValue> Tmp =
20706       BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20707 
20708   if (IsStrict)
20709     return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20710 
20711   return Tmp.first;
20712 }
20713 
20714 std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20715     EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20716     MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20717   // Build the FILD
20718   SDVTList Tys;
20719   bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20720   if (useSSE)
20721     Tys = DAG.getVTList(MVT::f80, MVT::Other);
20722   else
20723     Tys = DAG.getVTList(DstVT, MVT::Other);
20724 
20725   SDValue FILDOps[] = {Chain, Pointer};
20726   SDValue Result =
20727       DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20728                               Alignment, MachineMemOperand::MOLoad);
20729   Chain = Result.getValue(1);
20730 
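  // FILD produces an x87 value. If the result is ultimately wanted in an SSE
  // register, round it to DstVT through memory: store it with FST and reload
  // it with a normal load.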
20731   if (useSSE) {
20732     MachineFunction &MF = DAG.getMachineFunction();
20733     unsigned SSFISize = DstVT.getStoreSize();
20734     int SSFI =
20735         MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20736     auto PtrVT = getPointerTy(MF.getDataLayout());
20737     SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20738     Tys = DAG.getVTList(MVT::Other);
20739     SDValue FSTOps[] = {Chain, Result, StackSlot};
20740     MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
20741         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
20742         MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20743 
20744     Chain =
20745         DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20746     Result = DAG.getLoad(
20747         DstVT, DL, Chain, StackSlot,
20748         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20749     Chain = Result.getValue(1);
20750   }
20751 
20752   return { Result, Chain };
20753 }
20754 
20755 /// Horizontal vector math instructions may be slower than normal math with
20756 /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20757 /// implementation, and likely shuffle complexity of the alternate sequence.
20758 static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20759                                   const X86Subtarget &Subtarget) {
20760   bool IsOptimizingSize = DAG.shouldOptForSize();
20761   bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20762   return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20763 }
20764 
20765 /// 64-bit unsigned integer to double expansion.
20766 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
20767                                    const X86Subtarget &Subtarget) {
  // We can't use this algorithm for strict fp. It produces -0.0 instead of
  // +0.0 when converting 0 while rounding toward negative infinity. The
  // caller will fall back to Expand when i64 is legal, or use FILD in 32-bit
  // mode.
20771   assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
  // This algorithm is not obvious. Here is what we're trying to output:
20773   /*
20774      movq       %rax,  %xmm0
20775      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20776      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20777      #ifdef __SSE3__
20778        haddpd   %xmm0, %xmm0
20779      #else
20780        pshufd   $0x4e, %xmm0, %xmm1
20781        addpd    %xmm1, %xmm0
20782      #endif
20783   */
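  // The two 32-bit halves of the input become the low mantissa bits of two
  // doubles whose exponent words are 0x43300000 (2^52) and 0x45300000 (2^84).
  // Subtracting { 0x1.0p52, 0x1.0p84 } then leaves exactly (double)lo and
  // (double)hi * 2^32 in the two lanes, and the horizontal add combines them
  // into the final result.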
20784 
20785   SDLoc dl(Op);
20786   LLVMContext *Context = DAG.getContext();
20787 
20788   // Build some magic constants.
20789   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20790   Constant *C0 = ConstantDataVector::get(*Context, CV0);
20791   auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20792   SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20793 
20794   SmallVector<Constant*,2> CV1;
20795   CV1.push_back(
20796     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20797                                       APInt(64, 0x4330000000000000ULL))));
20798   CV1.push_back(
20799     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20800                                       APInt(64, 0x4530000000000000ULL))));
20801   Constant *C1 = ConstantVector::get(CV1);
20802   SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20803 
20804   // Load the 64-bit value into an XMM register.
20805   SDValue XR1 =
20806       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20807   SDValue CLod0 = DAG.getLoad(
20808       MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20809       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20810   SDValue Unpck1 =
20811       getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20812 
20813   SDValue CLod1 = DAG.getLoad(
20814       MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20815       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20816   SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20817   // TODO: Are there any fast-math-flags to propagate here?
20818   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20819   SDValue Result;
20820 
20821   if (Subtarget.hasSSE3() &&
20822       shouldUseHorizontalOp(true, DAG, Subtarget)) {
20823     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20824   } else {
20825     SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20826     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20827   }
20828   Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20829                        DAG.getIntPtrConstant(0, dl));
20830   return Result;
20831 }
20832 
20833 /// 32-bit unsigned integer to float expansion.
20834 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
20835                                    const X86Subtarget &Subtarget) {
20836   unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20837   SDLoc dl(Op);
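  // Place the 32-bit value in the low word of a double whose high word is
  // 0x43300000 (i.e. 2^52). The result is exactly 2^52 + x, so subtracting
  // the 2^52 bias recovers x as a double.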
20838   // FP constant to bias correct the final result.
20839   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
20840                                    MVT::f64);
20841 
20842   // Load the 32-bit value into an XMM register.
20843   SDValue Load =
20844       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20845 
20846   // Zero out the upper parts of the register.
20847   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20848 
20849   // Or the load with the bias.
20850   SDValue Or = DAG.getNode(
20851       ISD::OR, dl, MVT::v2i64,
20852       DAG.getBitcast(MVT::v2i64, Load),
20853       DAG.getBitcast(MVT::v2i64,
20854                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20855   Or =
20856       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20857                   DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
20858 
20859   if (Op.getNode()->isStrictFPOpcode()) {
20860     // Subtract the bias.
20861     // TODO: Are there any fast-math-flags to propagate here?
20862     SDValue Chain = Op.getOperand(0);
20863     SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20864                               {Chain, Or, Bias});
20865 
20866     if (Op.getValueType() == Sub.getValueType())
20867       return Sub;
20868 
20869     // Handle final rounding.
20870     std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20871         Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20872 
20873     return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20874   }
20875 
20876   // Subtract the bias.
20877   // TODO: Are there any fast-math-flags to propagate here?
20878   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20879 
20880   // Handle final rounding.
20881   return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20882 }
20883 
20884 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
20885                                      const X86Subtarget &Subtarget,
20886                                      const SDLoc &DL) {
20887   if (Op.getSimpleValueType() != MVT::v2f64)
20888     return SDValue();
20889 
20890   bool IsStrict = Op->isStrictFPOpcode();
20891 
20892   SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20893   assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20894 
20895   if (Subtarget.hasAVX512()) {
20896     if (!Subtarget.hasVLX()) {
20897       // Let generic type legalization widen this.
20898       if (!IsStrict)
20899         return SDValue();
20900       // Otherwise pad the integer input with 0s and widen the operation.
20901       N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20902                        DAG.getConstant(0, DL, MVT::v2i32));
20903       SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20904                                 {Op.getOperand(0), N0});
20905       SDValue Chain = Res.getValue(1);
20906       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20907                         DAG.getIntPtrConstant(0, DL));
20908       return DAG.getMergeValues({Res, Chain}, DL);
20909     }
20910 
20911     // Legalize to v4i32 type.
20912     N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20913                      DAG.getUNDEF(MVT::v2i32));
20914     if (IsStrict)
20915       return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20916                          {Op.getOperand(0), N0});
20917     return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20918   }
20919 
  // Zero extend to v2i64, OR with the floating point representation of 2^52.
  // This gives us the floating point equivalent of 2^52 + the i32 integer
  // since double has 52 bits of mantissa. Then subtract 2^52 in floating
  // point, leaving just our i32 integers in double format.
20924   SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20925   SDValue VBias =
20926       DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
20927   SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20928                            DAG.getBitcast(MVT::v2i64, VBias));
20929   Or = DAG.getBitcast(MVT::v2f64, Or);
20930 
20931   if (IsStrict)
20932     return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20933                        {Op.getOperand(0), Or, VBias});
20934   return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20935 }
20936 
20937 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
20938                                      const X86Subtarget &Subtarget) {
20939   SDLoc DL(Op);
20940   bool IsStrict = Op->isStrictFPOpcode();
20941   SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20942   MVT VecIntVT = V.getSimpleValueType();
20943   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20944          "Unsupported custom type");
20945 
20946   if (Subtarget.hasAVX512()) {
    // With AVX512 but not VLX, widen to get a 512-bit result type.
20948     assert(!Subtarget.hasVLX() && "Unexpected features");
20949     MVT VT = Op->getSimpleValueType(0);
20950 
20951     // v8i32->v8f64 is legal with AVX512 so just return it.
20952     if (VT == MVT::v8f64)
20953       return Op;
20954 
20955     assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
20956            "Unexpected VT!");
20957     MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20958     MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20959     // Need to concat with zero vector for strict fp to avoid spurious
20960     // exceptions.
20961     SDValue Tmp =
20962         IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20963     V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20964                     DAG.getIntPtrConstant(0, DL));
20965     SDValue Res, Chain;
20966     if (IsStrict) {
20967       Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20968                         {Op->getOperand(0), V});
20969       Chain = Res.getValue(1);
20970     } else {
20971       Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20972     }
20973 
20974     Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20975                       DAG.getIntPtrConstant(0, DL));
20976 
20977     if (IsStrict)
20978       return DAG.getMergeValues({Res, Chain}, DL);
20979     return Res;
20980   }
20981 
20982   if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20983       Op->getSimpleValueType(0) == MVT::v4f64) {
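    // v4i32 -> v4f64 with AVX: zero extend to v4i64 and apply the same 2^52
    // bias trick as the v2i32 case above, with the bias broadcast-loaded from
    // the constant pool.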
20984     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20985     Constant *Bias = ConstantFP::get(
20986         *DAG.getContext(),
20987         APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20988     auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20989     SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20990     SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20991     SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20992     SDValue VBias = DAG.getMemIntrinsicNode(
20993         X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20994         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20995         MachineMemOperand::MOLoad);
20996 
20997     SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20998                              DAG.getBitcast(MVT::v4i64, VBias));
20999     Or = DAG.getBitcast(MVT::v4f64, Or);
21000 
21001     if (IsStrict)
21002       return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
21003                          {Op.getOperand(0), Or, VBias});
21004     return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
21005   }
21006 
21007   // The algorithm is the following:
21008   // #ifdef __SSE4_1__
21009   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21010   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21011   //                                 (uint4) 0x53000000, 0xaa);
21012   // #else
21013   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21014   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
21015   // #endif
21016   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21017   //     return (float4) lo + fhi;
21018 
21019   bool Is128 = VecIntVT == MVT::v4i32;
21020   MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // If we convert to something other than the supported type, e.g., to v4f64,
  // abort early.
21023   if (VecFloatVT != Op->getSimpleValueType(0))
21024     return SDValue();
21025 
  // In the #ifdef/#else code, we have in common:
21027   // - The vector of constants:
21028   // -- 0x4b000000
21029   // -- 0x53000000
21030   // - A shift:
21031   // -- v >> 16
21032 
21033   // Create the splat vector for 0x4b000000.
21034   SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
21035   // Create the splat vector for 0x53000000.
21036   SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
21037 
21038   // Create the right shift.
21039   SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
21040   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
21041 
21042   SDValue Low, High;
21043   if (Subtarget.hasSSE41()) {
21044     MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
21045     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21046     SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
21047     SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
21048     // Low will be bitcasted right away, so do not bother bitcasting back to its
21049     // original type.
21050     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
21051                       VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
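    // In both blends, the 0xaa mask takes the high 16-bit half of each 32-bit
    // lane from the constant and the low half from the other operand.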
21052     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21053     //                                 (uint4) 0x53000000, 0xaa);
21054     SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
21055     SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
21056     // High will be bitcasted right away, so do not bother bitcasting back to
21057     // its original type.
21058     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
21059                        VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21060   } else {
21061     SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
21062     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21063     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
21064     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
21065 
21066     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
21067     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
21068   }
21069 
21070   // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
21071   SDValue VecCstFSub = DAG.getConstantFP(
21072       APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
21073 
21074   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21075   // NOTE: By using fsub of a positive constant instead of fadd of a negative
21076   // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
21077   // enabled. See PR24512.
21078   SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
21079   // TODO: Are there any fast-math-flags to propagate here?
21080   //     (float4) lo;
21081   SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
21082   //     return (float4) lo + fhi;
21083   if (IsStrict) {
21084     SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
21085                                 {Op.getOperand(0), HighBitcast, VecCstFSub});
21086     return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
21087                        {FHigh.getValue(1), LowBitcast, FHigh});
21088   }
21089 
21090   SDValue FHigh =
21091       DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
21092   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
21093 }
21094 
21095 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
21096                                    const X86Subtarget &Subtarget) {
21097   unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21098   SDValue N0 = Op.getOperand(OpNo);
21099   MVT SrcVT = N0.getSimpleValueType();
21100   SDLoc dl(Op);
21101 
21102   switch (SrcVT.SimpleTy) {
21103   default:
21104     llvm_unreachable("Custom UINT_TO_FP is not supported!");
21105   case MVT::v2i32:
21106     return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
21107   case MVT::v4i32:
21108   case MVT::v8i32:
21109     return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
21110   case MVT::v2i64:
21111   case MVT::v4i64:
21112     return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21113   }
21114 }
21115 
21116 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
21117                                            SelectionDAG &DAG) const {
21118   bool IsStrict = Op->isStrictFPOpcode();
21119   unsigned OpNo = IsStrict ? 1 : 0;
21120   SDValue Src = Op.getOperand(OpNo);
21121   SDLoc dl(Op);
21122   auto PtrVT = getPointerTy(DAG.getDataLayout());
21123   MVT SrcVT = Src.getSimpleValueType();
21124   MVT DstVT = Op->getSimpleValueType(0);
21125   SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21126 
21127   if (DstVT == MVT::f128)
21128     return SDValue();
21129 
21130   if (DstVT.isVector())
21131     return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
21132 
21133   if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21134     return LowerWin64_INT128_TO_FP(Op, DAG);
21135 
21136   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21137     return Extract;
21138 
21139   if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
21140       (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
21141     // Conversions from unsigned i32 to f32/f64 are legal,
21142     // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
21143     return Op;
21144   }
21145 
21146   // Promote i32 to i64 and use a signed conversion on 64-bit targets.
21147   if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
21148     Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
21149     if (IsStrict)
21150       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
21151                          {Chain, Src});
21152     return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
21153   }
21154 
21155   if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21156     return V;
21157   if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21158     return V;
21159 
21160   // The transform for i64->f64 isn't correct for 0 when rounding to negative
21161   // infinity. It produces -0.0, so disable under strictfp.
21162   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
21163     return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
21164   if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
21165     return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
21166   if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
21167       (DstVT == MVT::f32 || DstVT == MVT::f64))
21168     return SDValue();
21169 
21170   // Make a 64-bit buffer, and use it to build an FILD.
21171   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
21172   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
21173   Align SlotAlign(8);
21174   MachinePointerInfo MPI =
21175     MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21176   if (SrcVT == MVT::i32) {
21177     SDValue OffsetSlot =
21178         DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
21179     SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
21180     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
21181                                   OffsetSlot, MPI.getWithOffset(4), SlotAlign);
21182     std::pair<SDValue, SDValue> Tmp =
21183         BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
21184     if (IsStrict)
21185       return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21186 
21187     return Tmp.first;
21188   }
21189 
21190   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
21191   SDValue ValueToStore = Src;
21192   if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
21193     // Bitcasting to f64 here allows us to do a single 64-bit store from
21194     // an SSE register, avoiding the store forwarding penalty that would come
21195     // with two 32-bit stores.
21196     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21197   }
21198   SDValue Store =
21199       DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
21200   // For i64 source, we need to add the appropriate power of 2 if the input
21201   // was negative. We must be careful to do the computation in x87 extended
21202   // precision, not in SSE.
21203   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21204   SDValue Ops[] = { Store, StackSlot };
21205   SDValue Fild =
21206       DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
21207                               SlotAlign, MachineMemOperand::MOLoad);
21208   Chain = Fild.getValue(1);
21209 
21211   // Check whether the sign bit is set.
21212   SDValue SignSet = DAG.getSetCC(
21213       dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
21214       Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
21215 
21216   // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
21217   APInt FF(64, 0x5F80000000000000ULL);
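  // The high 32 bits (0x5F800000) are the f32 encoding of 2^64; adding it
  // compensates for FILD having interpreted an input with the sign bit set as
  // a negative value.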
21218   SDValue FudgePtr = DAG.getConstantPool(
21219       ConstantInt::get(*DAG.getContext(), FF), PtrVT);
21220   Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
21221 
21222   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
21223   SDValue Zero = DAG.getIntPtrConstant(0, dl);
21224   SDValue Four = DAG.getIntPtrConstant(4, dl);
21225   SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
21226   FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
21227 
21228   // Load the value out, extending it from f32 to f80.
21229   SDValue Fudge = DAG.getExtLoad(
21230       ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
21231       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
21232       CPAlignment);
21233   Chain = Fudge.getValue(1);
21234   // Extend everything to 80 bits to force it to be done on x87.
21235   // TODO: Are there any fast-math-flags to propagate here?
21236   if (IsStrict) {
21237     SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
21238                               {Chain, Fild, Fudge});
21239     // STRICT_FP_ROUND can't handle equal types.
21240     if (DstVT == MVT::f80)
21241       return Add;
21242     return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
21243                        {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
21244   }
21245   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
21246   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
21247                      DAG.getIntPtrConstant(0, dl));
21248 }
21249 
21250 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
21251 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
21252 // just return an SDValue().
21253 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
21254 // to i16, i32 or i64, and we lower it to a legal sequence and return the
21255 // result.
21256 SDValue
21257 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
21258                                    bool IsSigned, SDValue &Chain) const {
21259   bool IsStrict = Op->isStrictFPOpcode();
21260   SDLoc DL(Op);
21261 
21262   EVT DstTy = Op.getValueType();
21263   SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
21264   EVT TheVT = Value.getValueType();
21265   auto PtrVT = getPointerTy(DAG.getDataLayout());
21266 
21267   if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
21268     // f16 must be promoted before using the lowering in this routine.
21269     // fp128 does not use this lowering.
21270     return SDValue();
21271   }
21272 
21273   // If using FIST to compute an unsigned i64, we'll need some fixup
21274   // to handle values above the maximum signed i64.  A FIST is always
21275   // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
21276   bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
21277 
21278   // FIXME: This does not generate an invalid exception if the input does not
21279   // fit in i32. PR44019
21280   if (!IsSigned && DstTy != MVT::i64) {
21281     // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
21282     // The low 32 bits of the fist result will have the correct uint32 result.
21283     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
21284     DstTy = MVT::i64;
21285   }
21286 
21287   assert(DstTy.getSimpleVT() <= MVT::i64 &&
21288          DstTy.getSimpleVT() >= MVT::i16 &&
21289          "Unknown FP_TO_INT to lower!");
21290 
21291   // We lower FP->int64 into FISTP64 followed by a load from a temporary
21292   // stack slot.
21293   MachineFunction &MF = DAG.getMachineFunction();
21294   unsigned MemSize = DstTy.getStoreSize();
21295   int SSFI =
21296       MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
21297   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21298 
21299   Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21300 
21301   SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
21302 
21303   if (UnsignedFixup) {
21304     //
21305     // Conversion to unsigned i64 is implemented with a select,
21306     // depending on whether the source value fits in the range
21307     // of a signed i64.  Let Thresh be the FP equivalent of
21308     // 0x8000000000000000ULL.
21309     //
21310     //  Adjust = (Value >= Thresh) ? 0x80000000 : 0;
    //  FltOfs = (Value >= Thresh) ? Thresh : 0;
21312     //  FistSrc = (Value - FltOfs);
21313     //  Fist-to-mem64 FistSrc
21314     //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
21315     //  to XOR'ing the high 32 bits with Adjust.
21316     //
21317     // Being a power of 2, Thresh is exactly representable in all FP formats.
21318     // For X87 we'd like to use the smallest FP type for this constant, but
21319     // for DAG type consistency we have to match the FP operand type.
21320 
21321     APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
21322     LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
21323     bool LosesInfo = false;
21324     if (TheVT == MVT::f64)
21325       // The rounding mode is irrelevant as the conversion should be exact.
21326       Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
21327                               &LosesInfo);
21328     else if (TheVT == MVT::f80)
21329       Status = Thresh.convert(APFloat::x87DoubleExtended(),
21330                               APFloat::rmNearestTiesToEven, &LosesInfo);
21331 
21332     assert(Status == APFloat::opOK && !LosesInfo &&
21333            "FP conversion should have been exact");
21334 
21335     SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
21336 
21337     EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
21338                                    *DAG.getContext(), TheVT);
21339     SDValue Cmp;
21340     if (IsStrict) {
21341       Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
21342                          /*IsSignaling*/ true);
21343       Chain = Cmp.getValue(1);
21344     } else {
21345       Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
21346     }
21347 
21348     // Our preferred lowering of
21349     //
21350     // (Value >= Thresh) ? 0x8000000000000000ULL : 0
21351     //
21352     // is
21353     //
21354     // (Value >= Thresh) << 63
21355     //
21356     // but since we can get here after LegalOperations, DAGCombine might do the
21357     // wrong thing if we create a select. So, directly create the preferred
21358     // version.
21359     SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
21360     SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
21361     Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
21362 
21363     SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
21364                                    DAG.getConstantFP(0.0, DL, TheVT));
21365 
21366     if (IsStrict) {
21367       Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
21368                           { Chain, Value, FltOfs });
21369       Chain = Value.getValue(1);
21370     } else
21371       Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
21372   }
21373 
21374   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
21375 
  // FIXME: This causes a redundant load/store if the SSE-class value is
  // already in memory, such as if it is on the call stack.
21378   if (isScalarFPTypeInSSEReg(TheVT)) {
21379     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
21380     Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
21381     SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21382     SDValue Ops[] = { Chain, StackSlot };
21383 
21384     unsigned FLDSize = TheVT.getStoreSize();
21385     assert(FLDSize <= MemSize && "Stack slot not big enough");
21386     MachineMemOperand *MMO = MF.getMachineMemOperand(
21387         MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
21388     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
21389     Chain = Value.getValue(1);
21390   }
21391 
21392   // Build the FP_TO_INT*_IN_MEM
21393   MachineMemOperand *MMO = MF.getMachineMemOperand(
21394       MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
21395   SDValue Ops[] = { Chain, Value, StackSlot };
21396   SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
21397                                          DAG.getVTList(MVT::Other),
21398                                          Ops, DstTy, MMO);
21399 
21400   SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
21401   Chain = Res.getValue(1);
21402 
21403   // If we need an unsigned fixup, XOR the result with adjust.
21404   if (UnsignedFixup)
21405     Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
21406 
21407   return Res;
21408 }
21409 
21410 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
21411                               const X86Subtarget &Subtarget) {
21412   MVT VT = Op.getSimpleValueType();
21413   SDValue In = Op.getOperand(0);
21414   MVT InVT = In.getSimpleValueType();
21415   SDLoc dl(Op);
21416   unsigned Opc = Op.getOpcode();
21417 
21418   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
21419   assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
21420          "Unexpected extension opcode");
21421   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21422          "Expected same number of elements");
21423   assert((VT.getVectorElementType() == MVT::i16 ||
21424           VT.getVectorElementType() == MVT::i32 ||
21425           VT.getVectorElementType() == MVT::i64) &&
21426          "Unexpected element type");
21427   assert((InVT.getVectorElementType() == MVT::i8 ||
21428           InVT.getVectorElementType() == MVT::i16 ||
21429           InVT.getVectorElementType() == MVT::i32) &&
21430          "Unexpected element type");
21431 
21432   unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
21433 
21434   if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
21435     assert(InVT == MVT::v32i8 && "Unexpected VT!");
21436     return splitVectorIntUnary(Op, DAG);
21437   }
21438 
21439   if (Subtarget.hasInt256())
21440     return Op;
21441 
21442   // Optimize vectors in AVX mode:
21443   //
21444   //   v8i16 -> v8i32
  //   Use vpmovzxwd for the 4 lower elements  v8i16 -> v4i32.
  //   Use vpunpckhwd for the 4 upper elements  v8i16 -> v4i32.
  //   Concat upper and lower parts.
  //
  //   v4i32 -> v4i64
  //   Use vpmovzxdq for the 2 lower elements  v4i32 -> v2i64.
  //   Use vpunpckhdq for the 2 upper elements  v4i32 -> v2i64.
21452   //   Concat upper and lower parts.
21453   //
21454   MVT HalfVT = VT.getHalfNumVectorElementsVT();
21455   SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
21456 
21457   // Short-circuit if we can determine that each 128-bit half is the same value.
21458   // Otherwise, this is difficult to match and optimize.
21459   if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
21460     if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
21461       return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
21462 
21463   SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
21464   SDValue Undef = DAG.getUNDEF(InVT);
21465   bool NeedZero = Opc == ISD::ZERO_EXTEND;
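  // Unpacking the high half of In with zero (zext) or undef (anyext)
  // interleaves each source element with its extension bits, producing the
  // widened upper-half elements directly.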
21466   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
21467   OpHi = DAG.getBitcast(HalfVT, OpHi);
21468 
21469   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
21470 }
21471 
21472 // Helper to split and extend a v16i1 mask to v16i8 or v16i16.
21473 static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
21474                                    const SDLoc &dl, SelectionDAG &DAG) {
21475   assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
21476   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
21477                            DAG.getIntPtrConstant(0, dl));
21478   SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
21479                            DAG.getIntPtrConstant(8, dl));
21480   Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
21481   Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
21482   SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
21483   return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21484 }
21485 
21486 static  SDValue LowerZERO_EXTEND_Mask(SDValue Op,
21487                                       const X86Subtarget &Subtarget,
21488                                       SelectionDAG &DAG) {
21489   MVT VT = Op->getSimpleValueType(0);
21490   SDValue In = Op->getOperand(0);
21491   MVT InVT = In.getSimpleValueType();
21492   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
21493   SDLoc DL(Op);
21494   unsigned NumElts = VT.getVectorNumElements();
21495 
  // For all vectors but vXi8, we can just emit a sign_extend and a shift.
  // This avoids a constant pool load.
21498   if (VT.getVectorElementType() != MVT::i8) {
21499     SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
21500     return DAG.getNode(ISD::SRL, DL, VT, Extend,
21501                        DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
21502   }
21503 
21504   // Extend VT if BWI is not supported.
21505   MVT ExtVT = VT;
21506   if (!Subtarget.hasBWI()) {
21507     // If v16i32 is to be avoided, we'll need to split and concatenate.
21508     if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21509       return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21510 
21511     ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21512   }
21513 
21514   // Widen to 512-bits if VLX is not supported.
21515   MVT WideVT = ExtVT;
21516   if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21517     NumElts *= 512 / ExtVT.getSizeInBits();
21518     InVT = MVT::getVectorVT(MVT::i1, NumElts);
21519     In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
21520                      In, DAG.getIntPtrConstant(0, DL));
21521     WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
21522                               NumElts);
21523   }
21524 
21525   SDValue One = DAG.getConstant(1, DL, WideVT);
21526   SDValue Zero = DAG.getConstant(0, DL, WideVT);
21527 
21528   SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21529 
21530   // Truncate if we had to extend above.
21531   if (VT != ExtVT) {
21532     WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21533     SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21534   }
21535 
21536   // Extract back to 128/256-bit if we widened.
21537   if (WideVT != VT)
21538     SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21539                               DAG.getIntPtrConstant(0, DL));
21540 
21541   return SelectedVal;
21542 }
21543 
21544 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21545                                 SelectionDAG &DAG) {
21546   SDValue In = Op.getOperand(0);
21547   MVT SVT = In.getSimpleValueType();
21548 
21549   if (SVT.getVectorElementType() == MVT::i1)
21550     return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
21551 
21552   assert(Subtarget.hasAVX() && "Expected AVX support");
21553   return LowerAVXExtend(Op, DAG, Subtarget);
21554 }
21555 
21556 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21557 /// It makes use of the fact that vectors with enough leading sign/zero bits
21558 /// prevent the PACKSS/PACKUS from saturating the results.
21559 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21560 /// within each 128-bit lane.
21561 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21562                                       const SDLoc &DL, SelectionDAG &DAG,
21563                                       const X86Subtarget &Subtarget) {
21564   assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21565          "Unexpected PACK opcode");
21566   assert(DstVT.isVector() && "VT not a vector?");
21567 
21568   // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21569   if (!Subtarget.hasSSE2())
21570     return SDValue();
21571 
21572   EVT SrcVT = In.getValueType();
21573 
  // No truncation required; we might get here due to recursive calls.
21575   if (SrcVT == DstVT)
21576     return In;
21577 
  // We only support vector truncation to 64 bits or greater from a source of
  // 128 bits or greater.
21580   unsigned DstSizeInBits = DstVT.getSizeInBits();
21581   unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21582   if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
21583     return SDValue();
21584 
21585   unsigned NumElems = SrcVT.getVectorNumElements();
21586   if (!isPowerOf2_32(NumElems))
21587     return SDValue();
21588 
21589   LLVMContext &Ctx = *DAG.getContext();
21590   assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21591   assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21592 
21593   EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21594 
21595   // Pack to the largest type possible:
21596   // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21597   EVT InVT = MVT::i16, OutVT = MVT::i8;
21598   if (SrcVT.getScalarSizeInBits() > 16 &&
21599       (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21600     InVT = MVT::i32;
21601     OutVT = MVT::i16;
21602   }
21603 
21604   // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
21605   if (SrcVT.is128BitVector()) {
21606     InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21607     OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21608     In = DAG.getBitcast(InVT, In);
21609     SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
21610     Res = extractSubVector(Res, 0, DAG, DL, 64);
21611     return DAG.getBitcast(DstVT, Res);
21612   }
21613 
21614   // Split lower/upper subvectors.
21615   SDValue Lo, Hi;
21616   std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21617 
21618   unsigned SubSizeInBits = SrcSizeInBits / 2;
21619   InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21620   OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21621 
21622   // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21623   if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21624     Lo = DAG.getBitcast(InVT, Lo);
21625     Hi = DAG.getBitcast(InVT, Hi);
21626     SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21627     return DAG.getBitcast(DstVT, Res);
21628   }
21629 
21630   // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21631   // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21632   if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21633     Lo = DAG.getBitcast(InVT, Lo);
21634     Hi = DAG.getBitcast(InVT, Hi);
21635     SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21636 
21637     // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21638     // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21639     // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21640     SmallVector<int, 64> Mask;
21641     int Scale = 64 / OutVT.getScalarSizeInBits();
21642     narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21643     Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21644 
21645     if (DstVT.is256BitVector())
21646       return DAG.getBitcast(DstVT, Res);
21647 
21648     // If 512bit -> 128bit truncate another stage.
21649     EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21650     Res = DAG.getBitcast(PackedVT, Res);
21651     return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21652   }
21653 
21654   // Recursively pack lower/upper subvectors, concat result and pack again.
21655   assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21656   EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21657   Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
21658   Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
21659 
21660   PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21661   SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21662   return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21663 }
21664 
21665 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
21666                                   const X86Subtarget &Subtarget) {
21667 
21668   SDLoc DL(Op);
21669   MVT VT = Op.getSimpleValueType();
21670   SDValue In = Op.getOperand(0);
21671   MVT InVT = In.getSimpleValueType();
21672 
21673   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21674 
21675   // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21676   unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21677   if (InVT.getScalarSizeInBits() <= 16) {
21678     if (Subtarget.hasBWI()) {
      // Legal; this will be selected to VPMOVB2M/VPMOVW2M.
21680       if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
        // We need to shift to get the lsb into sign position.
        // Shifting packed bytes is not supported natively, so bitcast to
        // words.
21683         MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21684         In = DAG.getNode(ISD::SHL, DL, ExtVT,
21685                          DAG.getBitcast(ExtVT, In),
21686                          DAG.getConstant(ShiftInx, DL, ExtVT));
21687         In = DAG.getBitcast(InVT, In);
21688       }
21689       return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21690                           In, ISD::SETGT);
21691     }
    // Use TESTD/Q on the vector extended to packed dword/qword elements.
21693     assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21694            "Unexpected vector type.");
21695     unsigned NumElts = InVT.getVectorNumElements();
21696     assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21697     // We need to change to a wider element type that we have support for.
21698     // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21699     // For 16 element vectors we extend to v16i32 unless we are explicitly
21700     // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21701     // we need to split into two 8 element vectors which we can extend to v8i32,
21702     // truncate and concat the results. There's an additional complication if
21703     // the original type is v16i8. In that case we can't split the v16i8
21704     // directly, so we need to shuffle high elements to low and use
21705     // sign_extend_vector_inreg.
21706     if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21707       SDValue Lo, Hi;
21708       if (InVT == MVT::v16i8) {
21709         Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21710         Hi = DAG.getVectorShuffle(
21711             InVT, DL, In, In,
21712             {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21713         Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21714       } else {
21715         assert(InVT == MVT::v16i16 && "Unexpected VT!");
21716         Lo = extract128BitVector(In, 0, DAG, DL);
21717         Hi = extract128BitVector(In, 8, DAG, DL);
21718       }
21719       // We're split now, just emit two truncates and a concat. The two
21720       // truncates will trigger legalization to come back to this function.
21721       Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21722       Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21723       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21724     }
21725     // We either have 8 elements or we're allowed to use 512-bit vectors.
21726     // If we have VLX, we want to use the narrowest vector that can get the
21727     // job done so we use vXi32.
21728     MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21729     MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21730     In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21731     InVT = ExtVT;
21732     ShiftInx = InVT.getScalarSizeInBits() - 1;
21733   }
21734 
21735   if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21736     // We need to shift to get the lsb into sign position.
21737     In = DAG.getNode(ISD::SHL, DL, InVT, In,
21738                      DAG.getConstant(ShiftInx, DL, InVT));
21739   }
21740   // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21741   if (Subtarget.hasDQI())
21742     return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21743   return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21744 }
21745 
21746 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21747   SDLoc DL(Op);
21748   MVT VT = Op.getSimpleValueType();
21749   SDValue In = Op.getOperand(0);
21750   MVT InVT = In.getSimpleValueType();
21751   unsigned InNumEltBits = InVT.getScalarSizeInBits();
21752 
21753   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21754          "Invalid TRUNCATE operation");
21755 
21756   // If we're called by the type legalizer, handle a few cases.
21757   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21758   if (!TLI.isTypeLegal(InVT)) {
21759     if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21760         VT.is128BitVector()) {
21761       assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21762              "Unexpected subtarget!");
21763       // The default behavior is to truncate one step, concatenate, and then
21764       // truncate the remainder. We'd rather produce two 64-bit results and
21765       // concatenate those.
21766       SDValue Lo, Hi;
21767       std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21768 
21769       EVT LoVT, HiVT;
21770       std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21771 
21772       Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21773       Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21774       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21775     }
21776 
21777     // Otherwise let default legalization handle it.
21778     return SDValue();
21779   }
21780 
21781   if (VT.getVectorElementType() == MVT::i1)
21782     return LowerTruncateVecI1(Op, DAG, Subtarget);
21783 
21784   // vpmovqb/w/d, vpmovdb/w, vpmovwb
21785   if (Subtarget.hasAVX512()) {
21786     if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21787       assert(VT == MVT::v32i8 && "Unexpected VT!");
21788       return splitVectorIntUnary(Op, DAG);
21789     }
21790 
    // Word to byte truncation is only legal with BWI. Otherwise we have to
    // promote to v16i32 and then truncate that. But we should only do that if
    // we haven't been asked to avoid 512-bit vectors. The actual promotion to
    // v16i32 will be handled by isel patterns.
21795     if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21796         Subtarget.canExtendTo512DQ())
21797       return Op;
21798   }
21799 
21800   unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
21801   unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21802 
21803   // Truncate with PACKUS if we are truncating a vector with leading zero bits
21804   // that extend all the way to the packed/truncated value.
21805   // Pre-SSE41 we can only use PACKUSWB.
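  // For example, a v8i32 -> v8i16 truncation can use PACKUSDW as long as the
  // upper 16 bits of every source element are known to be zero. Pre-SSE41,
  // only PACKUSWB's 8-bit unsigned saturation is available, so the known
  // zeros must extend all the way down to bit 8.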
21806   KnownBits Known = DAG.computeKnownBits(In);
21807   if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
21808     if (SDValue V =
21809             truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
21810       return V;
21811 
21812   // Truncate with PACKSS if we are truncating a vector with sign-bits that
21813   // extend all the way to the packed/truncated value.
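  // For example, v8i32 -> v8i16 can use PACKSSDW when every source element
  // has at least 17 sign bits, i.e. the value already fits in the signed
  // 16-bit range and the pack's saturation never triggers.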
21814   if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
21815     if (SDValue V =
21816             truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
21817       return V;
21818 
21819   // Handle truncation of V256 to V128 using shuffles.
21820   assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21821 
21822   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21823     In = DAG.getBitcast(MVT::v8i32, In);
21824 
21825     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
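    // The shuffle mask below selects the low dword of each qword; the upper
    // half of the shuffled result is undefined and is discarded by the
    // following subvector extract.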
21826     if (Subtarget.hasInt256()) {
21827       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21828       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21829       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21830                          DAG.getIntPtrConstant(0, DL));
21831     }
21832 
21833     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21834                                DAG.getIntPtrConstant(0, DL));
21835     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21836                                DAG.getIntPtrConstant(4, DL));
21837     static const int ShufMask[] = {0, 2, 4, 6};
21838     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
21839   }
21840 
21841   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21842     In = DAG.getBitcast(MVT::v32i8, In);
21843 
21844     // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
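    // PSHUFB only shuffles within each 128-bit lane, so the first mask packs
    // the low two bytes of every dword into the bottom 8 bytes of its own
    // lane, and a second cross-lane shuffle (typically a VPERMQ) then gathers
    // the two lane results together.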
21845     if (Subtarget.hasInt256()) {
21846       // The PSHUFB mask:
21847       static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
21848                                       -1, -1, -1, -1, -1, -1, -1, -1,
21849                                       16, 17, 20, 21, 24, 25, 28, 29,
21850                                       -1, -1, -1, -1, -1, -1, -1, -1 };
21851       In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21852       In = DAG.getBitcast(MVT::v4i64, In);
21853 
21854       static const int ShufMask2[] = {0, 2, -1, -1};
21855       In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21856       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
21857                          DAG.getBitcast(MVT::v16i16, In),
21858                          DAG.getIntPtrConstant(0, DL));
21859     }
21860 
21861     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21862                                DAG.getIntPtrConstant(0, DL));
21863     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21864                                DAG.getIntPtrConstant(16, DL));
21865 
21866     // The PSHUFB mask:
21867     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
21868                                    -1, -1, -1, -1, -1, -1, -1, -1};
21869 
21870     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
21871     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
21872 
21873     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
21874     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
21875 
21876     // The MOVLHPS Mask:
21877     static const int ShufMask2[] = {0, 1, 4, 5};
21878     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
21879     return DAG.getBitcast(MVT::v8i16, res);
21880   }
21881 
21882   if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
    // Use an AND to zero the upper bits for PACKUS.
21884     In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
21885 
21886     SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21887                                DAG.getIntPtrConstant(0, DL));
21888     SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21889                                DAG.getIntPtrConstant(8, DL));
21890     return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
21891   }
21892 
21893   llvm_unreachable("All 256->128 cases should have been handled above!");
21894 }
21895 
// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instructions
// behave on out-of-range inputs to generate optimized conversions.
21898 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21899                                     SelectionDAG &DAG,
21900                                     const X86Subtarget &Subtarget) {
21901   MVT SrcVT = Src.getSimpleValueType();
21902   unsigned DstBits = VT.getScalarSizeInBits();
21903   assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21904 
21905   // Calculate the converted result for values in the range 0 to
21906   // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21907   SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21908   SDValue Big =
21909       DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21910                   DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21911                               DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21912 
  // The "CVTTP2SI" instruction conveniently sets the sign bit if
  // and only if the value was out of range. So we can use that
  // as our indicator to select "Big" instead of "Small".
  //
  // Use "Small" if "IsOverflown" has all bits cleared
  // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
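  // Illustrative example (values chosen for exposition): for a lane holding
  // 3000000000.0f, "Small" = cvttps2dq(3e9) = 0x80000000 (the integer
  // indefinite value), "Big" = cvttps2dq(3e9 - 2^31) = 0x32D05E00, and the
  // set sign bit of "Small" selects 0x80000000 | 0x32D05E00 = 0xB2D05E00,
  // i.e. 3000000000 as an unsigned integer.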
21919 
21920   // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21921   // use the slightly slower blendv select instead.
21922   if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21923     SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21924     return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21925   }
21926 
21927   SDValue IsOverflown =
21928       DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21929                   DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21930   return DAG.getNode(ISD::OR, dl, VT, Small,
21931                      DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21932 }
21933 
21934 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21935   bool IsStrict = Op->isStrictFPOpcode();
21936   bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21937                   Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21938   MVT VT = Op->getSimpleValueType(0);
21939   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21940   SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21941   MVT SrcVT = Src.getSimpleValueType();
21942   SDLoc dl(Op);
21943 
21944   SDValue Res;
21945   if (VT.isVector()) {
21946     if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21947       MVT ResVT = MVT::v4i32;
21948       MVT TruncVT = MVT::v4i1;
21949       unsigned Opc;
21950       if (IsStrict)
21951         Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21952       else
21953         Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21954 
21955       if (!IsSigned && !Subtarget.hasVLX()) {
21956         assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21957         // Widen to 512-bits.
21958         ResVT = MVT::v8i32;
21959         TruncVT = MVT::v8i1;
21960         Opc = Op.getOpcode();
21961         // Need to concat with zero vector for strict fp to avoid spurious
21962         // exceptions.
21963         // TODO: Should we just do this for non-strict as well?
21964         SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21965                                : DAG.getUNDEF(MVT::v8f64);
21966         Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21967                           DAG.getIntPtrConstant(0, dl));
21968       }
21969       if (IsStrict) {
21970         Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21971         Chain = Res.getValue(1);
21972       } else {
21973         Res = DAG.getNode(Opc, dl, ResVT, Src);
21974       }
21975 
21976       Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21977       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21978                         DAG.getIntPtrConstant(0, dl));
21979       if (IsStrict)
21980         return DAG.getMergeValues({Res, Chain}, dl);
21981       return Res;
21982     }
21983 
21984     if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21985       if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
21986         return Op;
21987 
21988       MVT ResVT = VT;
21989       MVT EleVT = VT.getVectorElementType();
21990       if (EleVT != MVT::i64)
21991         ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21992 
21993       if (SrcVT != MVT::v8f16) {
21994         SDValue Tmp =
21995             IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21996         SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21997         Ops[0] = Src;
21998         Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21999       }
22000 
22001       if (IsStrict) {
22002         Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
22003                                    : X86ISD::STRICT_CVTTP2UI,
22004                           dl, {ResVT, MVT::Other}, {Chain, Src});
22005         Chain = Res.getValue(1);
22006       } else {
22007         Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
22008                           ResVT, Src);
22009       }
22010 
22011       // TODO: Need to add exception check code for strict FP.
22012       if (EleVT.getSizeInBits() < 16) {
22013         ResVT = MVT::getVectorVT(EleVT, 8);
22014         Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
22015       }
22016 
22017       if (ResVT != VT)
22018         Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22019                           DAG.getIntPtrConstant(0, dl));
22020 
22021       if (IsStrict)
22022         return DAG.getMergeValues({Res, Chain}, dl);
22023       return Res;
22024     }
22025 
22026     if (VT == MVT::v8i16 && (SrcVT == MVT::v8f32 || SrcVT == MVT::v8f64)) {
22027       if (IsStrict) {
22028         Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
22029                                    : ISD::STRICT_FP_TO_UINT,
22030                           dl, {MVT::v8i32, MVT::Other}, {Chain, Src});
22031         Chain = Res.getValue(1);
22032       } else {
22033         Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
22034                           MVT::v8i32, Src);
22035       }
22036 
22037       // TODO: Need to add exception check code for strict FP.
22038       Res = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i16, Res);
22039 
22040       if (IsStrict)
22041         return DAG.getMergeValues({Res, Chain}, dl);
22042       return Res;
22043     }
22044 
22045     // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
22046     if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
22047       assert(!IsSigned && "Expected unsigned conversion!");
22048       assert(Subtarget.useAVX512Regs() && "Requires avx512f");
22049       return Op;
22050     }
22051 
22052     // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
22053     if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
22054         (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
22055         Subtarget.useAVX512Regs()) {
22056       assert(!IsSigned && "Expected unsigned conversion!");
22057       assert(!Subtarget.hasVLX() && "Unexpected features!");
22058       MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
22059       MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
22060       // Need to concat with zero vector for strict fp to avoid spurious
22061       // exceptions.
22062       // TODO: Should we just do this for non-strict as well?
22063       SDValue Tmp =
22064           IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
22065       Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
22066                         DAG.getIntPtrConstant(0, dl));
22067 
22068       if (IsStrict) {
22069         Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
22070                           {Chain, Src});
22071         Chain = Res.getValue(1);
22072       } else {
22073         Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
22074       }
22075 
22076       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22077                         DAG.getIntPtrConstant(0, dl));
22078 
22079       if (IsStrict)
22080         return DAG.getMergeValues({Res, Chain}, dl);
22081       return Res;
22082     }
22083 
22084     // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
22085     if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
22086         (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
22087         Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
22088       assert(!Subtarget.hasVLX() && "Unexpected features!");
22089       MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
22090       // Need to concat with zero vector for strict fp to avoid spurious
22091       // exceptions.
22092       // TODO: Should we just do this for non-strict as well?
22093       SDValue Tmp =
22094           IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
22095       Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
22096                         DAG.getIntPtrConstant(0, dl));
22097 
22098       if (IsStrict) {
22099         Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
22100                           {Chain, Src});
22101         Chain = Res.getValue(1);
22102       } else {
22103         Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
22104       }
22105 
22106       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22107                         DAG.getIntPtrConstant(0, dl));
22108 
22109       if (IsStrict)
22110         return DAG.getMergeValues({Res, Chain}, dl);
22111       return Res;
22112     }
22113 
22114     if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
22115       if (!Subtarget.hasVLX()) {
        // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the
        // type legalizer and then widened again by vector op legalization.
22118         if (!IsStrict)
22119           return SDValue();
22120 
22121         SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
22122         SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
22123                                   {Src, Zero, Zero, Zero});
22124         Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
22125                           {Chain, Tmp});
22126         SDValue Chain = Tmp.getValue(1);
22127         Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
22128                           DAG.getIntPtrConstant(0, dl));
22129         return DAG.getMergeValues({Tmp, Chain}, dl);
22130       }
22131 
22132       assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
22133       SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
22134                                 DAG.getUNDEF(MVT::v2f32));
22135       if (IsStrict) {
22136         unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
22137                                 : X86ISD::STRICT_CVTTP2UI;
22138         return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
22139       }
22140       unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
22141       return DAG.getNode(Opc, dl, VT, Tmp);
22142     }
22143 
    // Generate optimized instructions for pre-AVX512 unsigned conversions
    // from vXf32/vXf64 to vXi32.
22146     if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
22147         (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
22148         (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
22149       assert(!IsSigned && "Expected unsigned conversion!");
22150       return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
22151     }
22152 
22153     return SDValue();
22154   }
22155 
22156   assert(!VT.isVector());
22157 
22158   bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
22159 
22160   if (!IsSigned && UseSSEReg) {
22161     // Conversions from f32/f64 with AVX512 should be legal.
22162     if (Subtarget.hasAVX512())
22163       return Op;
22164 
    // We can leverage the specific way the "cvttss2si/cvttsd2si" instructions
    // behave on out-of-range inputs to generate optimized conversions.
22167     if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
22168                       (VT == MVT::i64 && Subtarget.is64Bit()))) {
22169       unsigned DstBits = VT.getScalarSizeInBits();
22170       APInt UIntLimit = APInt::getSignMask(DstBits);
22171       SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
22172                                         DAG.getConstant(UIntLimit, dl, VT));
22173       MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
22174 
22175       // Calculate the converted result for values in the range:
22176       // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
22177       // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
22178       SDValue Small =
22179           DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
22180                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
22181       SDValue Big = DAG.getNode(
22182           X86ISD::CVTTS2SI, dl, VT,
22183           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
22184                       DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
22185 
      // The "CVTTS2SI" instruction conveniently sets the sign bit if
      // and only if the value was out of range. So we can use that
      // as our indicator to select "Big" instead of "Small".
      //
      // Use "Small" if "IsOverflown" has all bits cleared
      // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
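      // Illustrative i32 example (values chosen for exposition): an input of
      // 3000000000.0 gives "Small" = 0x80000000, "Big" =
      // cvttss2si/cvttsd2si(input - 2^31) = 0x32D05E00, "IsOverflown" is
      // all-ones, and the OR below produces 0xB2D05E00 == 3000000000.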
22192       SDValue IsOverflown = DAG.getNode(
22193           ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
22194       return DAG.getNode(ISD::OR, dl, VT, Small,
22195                          DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
22196     }
22197 
22198     // Use default expansion for i64.
22199     if (VT == MVT::i64)
22200       return SDValue();
22201 
22202     assert(VT == MVT::i32 && "Unexpected VT!");
22203 
22204     // Promote i32 to i64 and use a signed operation on 64-bit targets.
22205     // FIXME: This does not generate an invalid exception if the input does not
22206     // fit in i32. PR44019
22207     if (Subtarget.is64Bit()) {
22208       if (IsStrict) {
22209         Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
22210                           {Chain, Src});
22211         Chain = Res.getValue(1);
22212       } else
22213         Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
22214 
22215       Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22216       if (IsStrict)
22217         return DAG.getMergeValues({Res, Chain}, dl);
22218       return Res;
22219     }
22220 
22221     // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
22222     // use fisttp which will be handled later.
22223     if (!Subtarget.hasSSE3())
22224       return SDValue();
22225   }
22226 
22227   // Promote i16 to i32 if we can use a SSE operation or the type is f128.
22228   // FIXME: This does not generate an invalid exception if the input does not
22229   // fit in i16. PR44019
22230   if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
22231     assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
22232     if (IsStrict) {
22233       Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
22234                         {Chain, Src});
22235       Chain = Res.getValue(1);
22236     } else
22237       Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
22238 
22239     Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22240     if (IsStrict)
22241       return DAG.getMergeValues({Res, Chain}, dl);
22242     return Res;
22243   }
22244 
22245   // If this is a FP_TO_SINT using SSEReg we're done.
22246   if (UseSSEReg && IsSigned)
22247     return Op;
22248 
22249   // fp128 needs to use a libcall.
22250   if (SrcVT == MVT::f128) {
22251     RTLIB::Libcall LC;
22252     if (IsSigned)
22253       LC = RTLIB::getFPTOSINT(SrcVT, VT);
22254     else
22255       LC = RTLIB::getFPTOUINT(SrcVT, VT);
22256 
22257     MakeLibCallOptions CallOptions;
22258     std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
22259                                                   SDLoc(Op), Chain);
22260 
22261     if (IsStrict)
22262       return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
22263 
22264     return Tmp.first;
22265   }
22266 
22267   // Fall back to X87.
22268   if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
22269     if (IsStrict)
22270       return DAG.getMergeValues({V, Chain}, dl);
22271     return V;
22272   }
22273 
22274   llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
22275 }
22276 
22277 SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
22278                                              SelectionDAG &DAG) const {
22279   SDValue Src = Op.getOperand(0);
22280   MVT SrcVT = Src.getSimpleValueType();
22281 
22282   // If the source is in an SSE register, the node is Legal.
22283   if (isScalarFPTypeInSSEReg(SrcVT))
22284     return Op;
22285 
22286   return LRINT_LLRINTHelper(Op.getNode(), DAG);
22287 }
22288 
22289 SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
22290                                               SelectionDAG &DAG) const {
22291   EVT DstVT = N->getValueType(0);
22292   SDValue Src = N->getOperand(0);
22293   EVT SrcVT = Src.getValueType();
22294 
22295   if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
22296     // f16 must be promoted before using the lowering in this routine.
22297     // fp128 does not use this lowering.
22298     return SDValue();
22299   }
22300 
22301   SDLoc DL(N);
22302   SDValue Chain = DAG.getEntryNode();
22303 
22304   bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
22305 
22306   // If we're converting from SSE, the stack slot needs to hold both types.
22307   // Otherwise it only needs to hold the DstVT.
22308   EVT OtherVT = UseSSE ? SrcVT : DstVT;
22309   SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22310   int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22311   MachinePointerInfo MPI =
22312       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22313 
22314   if (UseSSE) {
22315     assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22316     Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22317     SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22318     SDValue Ops[] = { Chain, StackPtr };
22319 
22320     Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22321                                   /*Align*/ None, MachineMemOperand::MOLoad);
22322     Chain = Src.getValue(1);
22323   }
22324 
22325   SDValue StoreOps[] = { Chain, Src, StackPtr };
22326   Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22327                                   StoreOps, DstVT, MPI, /*Align*/ None,
22328                                   MachineMemOperand::MOStore);
22329 
22330   return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22331 }
22332 
22333 SDValue
22334 X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22335   // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22336   // but making use of X86 specifics to produce better instruction sequences.
22337   SDNode *Node = Op.getNode();
22338   bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22339   unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22340   SDLoc dl(SDValue(Node, 0));
22341   SDValue Src = Node->getOperand(0);
22342 
22343   // There are three types involved here: SrcVT is the source floating point
22344   // type, DstVT is the type of the result, and TmpVT is the result of the
22345   // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22346   // DstVT).
22347   EVT SrcVT = Src.getValueType();
22348   EVT DstVT = Node->getValueType(0);
22349   EVT TmpVT = DstVT;
22350 
22351   // This code is only for floats and doubles. Fall back to generic code for
22352   // anything else.
22353   if (!isScalarFPTypeInSSEReg(SrcVT))
22354     return SDValue();
22355 
22356   EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22357   unsigned SatWidth = SatVT.getScalarSizeInBits();
22358   unsigned DstWidth = DstVT.getScalarSizeInBits();
22359   unsigned TmpWidth = TmpVT.getScalarSizeInBits();
  assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
         "Expected saturation width no larger than result width");
22362 
22363   // Promote result of FP_TO_*INT to at least 32 bits.
22364   if (TmpWidth < 32) {
22365     TmpVT = MVT::i32;
22366     TmpWidth = 32;
22367   }
22368 
  // Promote unsigned 32-bit conversions to 64-bit, which allows us to use a
  // native signed conversion instead.
22371   if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22372     TmpVT = MVT::i64;
22373     TmpWidth = 64;
22374   }
22375 
22376   // If the saturation width is smaller than the size of the temporary result,
22377   // we can always use signed conversion, which is native.
22378   if (SatWidth < TmpWidth)
22379     FpToIntOpcode = ISD::FP_TO_SINT;
22380 
22381   // Determine minimum and maximum integer values and their corresponding
22382   // floating-point values.
22383   APInt MinInt, MaxInt;
22384   if (IsSigned) {
22385     MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
22386     MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
22387   } else {
22388     MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
22389     MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
22390   }
22391 
22392   APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
22393   APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
22394 
22395   APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22396     MinInt, IsSigned, APFloat::rmTowardZero);
22397   APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22398     MaxInt, IsSigned, APFloat::rmTowardZero);
22399   bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22400                           && !(MaxStatus & APFloat::opStatus::opInexact);
22401 
22402   SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22403   SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22404 
22405   // If the integer bounds are exactly representable as floats, emit a
22406   // min+max+fptoi sequence. Otherwise use comparisons and selects.
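  // For example, an f32 -> i32 signed saturation has an upper bound of
  // 2^31-1, which is not exactly representable in f32 (it rounds down to
  // 2^31-128 under round-toward-zero), so the compare/select path is used;
  // with an f64 source both bounds are exact and the clamp path is taken.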
22407   if (AreExactFloatBounds) {
22408     if (DstVT != TmpVT) {
22409       // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22410       SDValue MinClamped = DAG.getNode(
22411         X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22412       // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22413       SDValue BothClamped = DAG.getNode(
22414         X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22415       // Convert clamped value to integer.
22416       SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22417 
22418       // NaN will become INDVAL, with the top bit set and the rest zero.
22419       // Truncation will discard the top bit, resulting in zero.
22420       return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22421     }
22422 
22423     // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22424     SDValue MinClamped = DAG.getNode(
22425       X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22426     // Clamp by MaxFloat from above. NaN cannot occur.
22427     SDValue BothClamped = DAG.getNode(
22428       X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22429     // Convert clamped value to integer.
22430     SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22431 
22432     if (!IsSigned) {
22433       // In the unsigned case we're done, because we mapped NaN to MinFloat,
22434       // which is zero.
22435       return FpToInt;
22436     }
22437 
22438     // Otherwise, select zero if Src is NaN.
22439     SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22440     return DAG.getSelectCC(
22441       dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22442   }
22443 
22444   SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22445   SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22446 
22447   // Result of direct conversion, which may be selected away.
22448   SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22449 
22450   if (DstVT != TmpVT) {
22451     // NaN will become INDVAL, with the top bit set and the rest zero.
22452     // Truncation will discard the top bit, resulting in zero.
22453     FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22454   }
22455 
22456   SDValue Select = FpToInt;
  // For signed conversions where we saturate to the same size as the
  // result type of the fptoi instruction, INDVAL coincides with the integer
  // minimum, so we don't need to explicitly check for it.
22460   if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22461     // If Src ULT MinFloat, select MinInt. In particular, this also selects
22462     // MinInt if Src is NaN.
22463     Select = DAG.getSelectCC(
22464       dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22465   }
22466 
22467   // If Src OGT MaxFloat, select MaxInt.
22468   Select = DAG.getSelectCC(
22469     dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22470 
22471   // In the unsigned case we are done, because we mapped NaN to MinInt, which
22472   // is already zero. The promoted case was already handled above.
22473   if (!IsSigned || DstVT != TmpVT) {
22474     return Select;
22475   }
22476 
22477   // Otherwise, select 0 if Src is NaN.
22478   SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22479   return DAG.getSelectCC(
22480     dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22481 }
22482 
22483 SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22484   bool IsStrict = Op->isStrictFPOpcode();
22485 
22486   SDLoc DL(Op);
22487   MVT VT = Op.getSimpleValueType();
22488   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22489   MVT SVT = In.getSimpleValueType();
22490 
22491   if (VT == MVT::f128)
22492     return SDValue();
22493 
22494   if (VT == MVT::f80) {
22495     if (SVT == MVT::f16) {
22496       assert(Subtarget.hasFP16() && "Unexpected features!");
22497       RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
22498       MakeLibCallOptions CallOptions;
22499       std::pair<SDValue, SDValue> Tmp =
22500           makeLibCall(DAG, LC, VT, In, CallOptions, DL,
22501                       IsStrict ? Op.getOperand(0) : SDValue());
22502       if (IsStrict)
22503         return DAG.getMergeValues({Tmp.first, Tmp.second}, DL);
22504       else
22505         return Tmp.first;
22506     }
22507     return Op;
22508   }
22509 
22510   if (SVT.getVectorElementType() == MVT::f16) {
22511     assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!");
22512     if (SVT == MVT::v2f16)
22513       In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22514                        DAG.getUNDEF(MVT::v2f16));
22515     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22516                               DAG.getUNDEF(MVT::v4f16));
22517     if (IsStrict)
22518       return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22519                          {Op->getOperand(0), Res});
22520     return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22521   }
22522 
22523   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22524 
22525   SDValue Res =
22526       DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22527   if (IsStrict)
22528     return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22529                        {Op->getOperand(0), Res});
22530   return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22531 }
22532 
22533 SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22534   bool IsStrict = Op->isStrictFPOpcode();
22535   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22536   MVT VT = Op.getSimpleValueType();
22537   MVT SVT = In.getSimpleValueType();
22538 
22539   // It's legal except when f128 is involved or we're converting f80->f16.
22540   if (SVT != MVT::f128 && !(VT == MVT::f16 && SVT == MVT::f80))
22541     return Op;
22542 
22543   return SDValue();
22544 }
22545 
22546 static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22547   bool IsStrict = Op->isStrictFPOpcode();
22548   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22549   assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22550          "Unexpected VT!");
22551 
22552   SDLoc dl(Op);
22553   SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22554                             DAG.getConstant(0, dl, MVT::v8i16), Src,
22555                             DAG.getIntPtrConstant(0, dl));
22556 
22557   SDValue Chain;
22558   if (IsStrict) {
22559     Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22560                       {Op.getOperand(0), Res});
22561     Chain = Res.getValue(1);
22562   } else {
22563     Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22564   }
22565 
22566   Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22567                     DAG.getIntPtrConstant(0, dl));
22568 
22569   if (IsStrict)
22570     return DAG.getMergeValues({Res, Chain}, dl);
22571 
22572   return Res;
22573 }
22574 
22575 static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22576   bool IsStrict = Op->isStrictFPOpcode();
22577   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22578   assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22579          "Unexpected VT!");
22580 
22581   SDLoc dl(Op);
22582   SDValue Res, Chain;
22583   if (IsStrict) {
22584     Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22585                       DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22586                       DAG.getIntPtrConstant(0, dl));
22587     Res = DAG.getNode(
22588         X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22589         {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22590     Chain = Res.getValue(1);
22591   } else {
22592     // FIXME: Should we use zeros for upper elements for non-strict?
22593     Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22594     Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22595                       DAG.getTargetConstant(4, dl, MVT::i32));
22596   }
22597 
22598   Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22599                     DAG.getIntPtrConstant(0, dl));
22600 
22601   if (IsStrict)
22602     return DAG.getMergeValues({Res, Chain}, dl);
22603 
22604   return Res;
22605 }
22606 
22607 /// Depending on uarch and/or optimizing for size, we might prefer to use a
22608 /// vector operation in place of the typical scalar operation.
22609 static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
22610                                          const X86Subtarget &Subtarget) {
22611   // If both operands have other uses, this is probably not profitable.
22612   SDValue LHS = Op.getOperand(0);
22613   SDValue RHS = Op.getOperand(1);
22614   if (!LHS.hasOneUse() && !RHS.hasOneUse())
22615     return Op;
22616 
  // FP horizontal add/sub were added with SSE3; the integer forms with SSSE3.
22618   bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22619   if (IsFP && !Subtarget.hasSSE3())
22620     return Op;
22621   if (!IsFP && !Subtarget.hasSSSE3())
22622     return Op;
22623 
22624   // Extract from a common vector.
22625   if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22626       RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22627       LHS.getOperand(0) != RHS.getOperand(0) ||
22628       !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22629       !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22630       !shouldUseHorizontalOp(true, DAG, Subtarget))
22631     return Op;
22632 
22633   // Allow commuted 'hadd' ops.
22634   // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22635   unsigned HOpcode;
22636   switch (Op.getOpcode()) {
22637     case ISD::ADD: HOpcode = X86ISD::HADD; break;
22638     case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22639     case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22640     case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22641     default:
22642       llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22643   }
22644   unsigned LExtIndex = LHS.getConstantOperandVal(1);
22645   unsigned RExtIndex = RHS.getConstantOperandVal(1);
22646   if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22647       (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22648     std::swap(LExtIndex, RExtIndex);
22649 
22650   if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22651     return Op;
22652 
22653   SDValue X = LHS.getOperand(0);
22654   EVT VecVT = X.getValueType();
22655   unsigned BitWidth = VecVT.getSizeInBits();
22656   unsigned NumLanes = BitWidth / 128;
22657   unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22658   assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22659          "Not expecting illegal vector widths here");
22660 
22661   // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22662   // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22663   SDLoc DL(Op);
22664   if (BitWidth == 256 || BitWidth == 512) {
22665     unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22666     X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22667     LExtIndex %= NumEltsPerLane;
22668   }
22669 
22670   // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22671   // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22672   // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22673   // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22674   SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22675   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22676                      DAG.getIntPtrConstant(LExtIndex / 2, DL));
22677 }
22678 
22679 /// Depending on uarch and/or optimizing for size, we might prefer to use a
22680 /// vector operation in place of the typical scalar operation.
22681 SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22682   assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22683          "Only expecting float/double");
22684   return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
22685 }
22686 
22687 /// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22688 /// This mode isn't supported in hardware on X86. But as long as we aren't
22689 /// compiling with trapping math, we can emulate this with
22690 /// trunc(X + copysign(nextafter(0.5, 0.0), X)).
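/// Using the value just below 0.5 (rather than 0.5 itself) matters for inputs
/// such as the largest double below 0.5: adding exactly 0.5 to it rounds the
/// sum up to 1.0 (under the default round-to-nearest-even), so trunc would
/// return 1 instead of 0. Adding nextafter(0.5, 0.0) keeps that sum below 1.0
/// while still pushing 0.5 itself up to 1.0 as required.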
22691 static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22692   SDValue N0 = Op.getOperand(0);
22693   SDLoc dl(Op);
22694   MVT VT = Op.getSimpleValueType();
22695 
22696   // N0 += copysign(nextafter(0.5, 0.0), N0)
22697   const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22698   bool Ignored;
22699   APFloat Point5Pred = APFloat(0.5f);
22700   Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22701   Point5Pred.next(/*nextDown*/true);
22702 
22703   SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22704                               DAG.getConstantFP(Point5Pred, dl, VT), N0);
22705   N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22706 
  // Truncate the result to remove the fraction.
22708   return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22709 }
22710 
22711 /// The only differences between FABS and FNEG are the mask and the logic op.
22712 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
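/// For a scalar f32, for instance, this lowers to a bitwise op on a splatted
/// 128-bit mask: FABS(x) = x & 0x7FFFFFFF, FNEG(x) = x ^ 0x80000000, and the
/// folded FNEG(FABS(x)) = x | 0x80000000.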
22713 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22714   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22715          "Wrong opcode for lowering FABS or FNEG.");
22716 
22717   bool IsFABS = (Op.getOpcode() == ISD::FABS);
22718 
22719   // If this is a FABS and it has an FNEG user, bail out to fold the combination
22720   // into an FNABS. We'll lower the FABS after that if it is still in use.
22721   if (IsFABS)
22722     for (SDNode *User : Op->uses())
22723       if (User->getOpcode() == ISD::FNEG)
22724         return Op;
22725 
22726   SDLoc dl(Op);
22727   MVT VT = Op.getSimpleValueType();
22728 
22729   bool IsF128 = (VT == MVT::f128);
22730   assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22731          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22732          "Unexpected type in LowerFABSorFNEG");
22733 
22734   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
22735   // decide if we should generate a 16-byte constant mask when we only need 4 or
22736   // 8 bytes for the scalar case.
22737 
  // There are no scalar bitwise logical SSE/AVX instructions, so we
  // generate a 16-byte vector constant and logic op even for the scalar case.
  // Using a 16-byte mask allows folding the load of the mask with
  // the logic op, which can save ~4 bytes of code size.
22742   bool IsFakeVector = !VT.isVector() && !IsF128;
22743   MVT LogicVT = VT;
22744   if (IsFakeVector)
22745     LogicVT = (VT == MVT::f64)   ? MVT::v2f64
22746               : (VT == MVT::f32) ? MVT::v4f32
22747                                  : MVT::v8f16;
22748 
22749   unsigned EltBits = VT.getScalarSizeInBits();
22750   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22751   APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22752                            APInt::getSignMask(EltBits);
22753   const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22754   SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22755 
22756   SDValue Op0 = Op.getOperand(0);
22757   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22758   unsigned LogicOp = IsFABS  ? X86ISD::FAND :
22759                      IsFNABS ? X86ISD::FOR  :
22760                                X86ISD::FXOR;
22761   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22762 
22763   if (VT.isVector() || IsF128)
22764     return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22765 
22766   // For the scalar case extend to a 128-bit vector, perform the logic op,
22767   // and extract the scalar result back out.
22768   Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22769   SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22770   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22771                      DAG.getIntPtrConstant(0, dl));
22772 }
22773 
22774 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
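  // Lower ISD::FCOPYSIGN as a pair of bitwise masks: keep the magnitude bits
  // of the first operand, isolate the sign bit of the second, and OR them
  // together. Scalars are widened to 128-bit vectors for the logic ops, just
  // as in LowerFABSorFNEG above.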
22775   SDValue Mag = Op.getOperand(0);
22776   SDValue Sign = Op.getOperand(1);
22777   SDLoc dl(Op);
22778 
22779   // If the sign operand is smaller, extend it first.
22780   MVT VT = Op.getSimpleValueType();
22781   if (Sign.getSimpleValueType().bitsLT(VT))
22782     Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22783 
22784   // And if it is bigger, shrink it first.
22785   if (Sign.getSimpleValueType().bitsGT(VT))
22786     Sign =
22787         DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(0, dl));
22788 
22789   // At this point the operands and the result should have the same
22790   // type, and that won't be f80 since that is not custom lowered.
22791   bool IsF128 = (VT == MVT::f128);
22792   assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22793          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22794          "Unexpected type in LowerFCOPYSIGN");
22795 
22796   const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22797 
22798   // Perform all scalar logic operations as 16-byte vectors because there are no
22799   // scalar FP logic instructions in SSE.
22800   // TODO: This isn't necessary. If we used scalar types, we might avoid some
22801   // unnecessary splats, but we might miss load folding opportunities. Should
22802   // this decision be based on OptimizeForSize?
22803   bool IsFakeVector = !VT.isVector() && !IsF128;
22804   MVT LogicVT = VT;
22805   if (IsFakeVector)
22806     LogicVT = (VT == MVT::f64)   ? MVT::v2f64
22807               : (VT == MVT::f32) ? MVT::v4f32
22808                                  : MVT::v8f16;
22809 
22810   // The mask constants are automatically splatted for vector types.
22811   unsigned EltSizeInBits = VT.getScalarSizeInBits();
22812   SDValue SignMask = DAG.getConstantFP(
22813       APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22814   SDValue MagMask = DAG.getConstantFP(
22815       APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22816 
22817   // First, clear all bits but the sign bit from the second operand (sign).
22818   if (IsFakeVector)
22819     Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22820   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22821 
22822   // Next, clear the sign bit from the first operand (magnitude).
22823   // TODO: If we had general constant folding for FP logic ops, this check
22824   // wouldn't be necessary.
22825   SDValue MagBits;
22826   if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22827     APFloat APF = Op0CN->getValueAPF();
22828     APF.clearSign();
22829     MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22830   } else {
22831     // If the magnitude operand wasn't a constant, we need to AND out the sign.
22832     if (IsFakeVector)
22833       Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22834     MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22835   }
22836 
22837   // OR the magnitude value with the sign bit.
22838   SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22839   return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22840                                           DAG.getIntPtrConstant(0, dl));
22841 }
22842 
22843 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22844   SDValue N0 = Op.getOperand(0);
22845   SDLoc dl(Op);
22846   MVT VT = Op.getSimpleValueType();
22847 
22848   MVT OpVT = N0.getSimpleValueType();
22849   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22850          "Unexpected type for FGETSIGN");
22851 
22852   // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22853   MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22854   SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22855   Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22856   Res = DAG.getZExtOrTrunc(Res, dl, VT);
22857   Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22858   return Res;
22859 }
22860 
22861 /// Helper for creating a X86ISD::SETCC node.
22862 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22863                         SelectionDAG &DAG) {
22864   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22865                      DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22866 }
22867 
22868 /// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
22869 /// style scalarized (associative) reduction patterns. Partial reductions
22870 /// are supported when the pointer SrcMask is non-null.
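/// For example, OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),OR(EXTRACTELT(X,2),
/// EXTRACTELT(X,3)))) on a v4i32 source returns X in SrcOps with all four
/// elements marked as used.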
22871 /// TODO - move this to SelectionDAG?
22872 static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22873                                  SmallVectorImpl<SDValue> &SrcOps,
22874                                  SmallVectorImpl<APInt> *SrcMask = nullptr) {
22875   SmallVector<SDValue, 8> Opnds;
22876   DenseMap<SDValue, APInt> SrcOpMap;
22877   EVT VT = MVT::Other;
22878 
  // Recognize a special case where a vector is cast into a wide integer to
  // test for all zeros.
22881   assert(Op.getOpcode() == unsigned(BinOp) &&
22882          "Unexpected bit reduction opcode");
22883   Opnds.push_back(Op.getOperand(0));
22884   Opnds.push_back(Op.getOperand(1));
22885 
22886   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22887     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
22888     // BFS traverse all BinOp operands.
22889     if (I->getOpcode() == unsigned(BinOp)) {
22890       Opnds.push_back(I->getOperand(0));
22891       Opnds.push_back(I->getOperand(1));
22892       // Re-evaluate the number of nodes to be traversed.
22893       e += 2; // 2 more nodes (LHS and RHS) are pushed.
22894       continue;
22895     }
22896 
    // Quit if this is not an EXTRACT_VECTOR_ELT.
22898     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22899       return false;
22900 
    // Quit if it lacks a constant index.
22902     auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22903     if (!Idx)
22904       return false;
22905 
22906     SDValue Src = I->getOperand(0);
22907     DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22908     if (M == SrcOpMap.end()) {
22909       VT = Src.getValueType();
22910       // Quit if not the same type.
22911       if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22912         return false;
22913       unsigned NumElts = VT.getVectorNumElements();
22914       APInt EltCount = APInt::getZero(NumElts);
22915       M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22916       SrcOps.push_back(Src);
22917     }
22918 
22919     // Quit if element already used.
22920     unsigned CIdx = Idx->getZExtValue();
22921     if (M->second[CIdx])
22922       return false;
22923     M->second.setBit(CIdx);
22924   }
22925 
22926   if (SrcMask) {
22927     // Collect the source partial masks.
22928     for (SDValue &SrcOp : SrcOps)
22929       SrcMask->push_back(SrcOpMap[SrcOp]);
22930   } else {
22931     // Quit if not all elements are used.
22932     for (const auto &I : SrcOpMap)
22933       if (!I.second.isAllOnes())
22934         return false;
22935   }
22936 
22937   return true;
22938 }
22939 
22940 // Helper function for comparing all bits of a vector against zero.
22941 static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
22942                                   const APInt &Mask,
22943                                   const X86Subtarget &Subtarget,
22944                                   SelectionDAG &DAG, X86::CondCode &X86CC) {
22945   EVT VT = V.getValueType();
22946   unsigned ScalarSize = VT.getScalarSizeInBits();
22947   if (Mask.getBitWidth() != ScalarSize) {
22948     assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22949     return SDValue();
22950   }
22951 
22952   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22953   X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22954 
22955   auto MaskBits = [&](SDValue Src) {
22956     if (Mask.isAllOnes())
22957       return Src;
22958     EVT SrcVT = Src.getValueType();
22959     SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22960     return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22961   };
22962 
  // For sub-128-bit vectors, cast to a (legal) integer and compare with zero.
22964   if (VT.getSizeInBits() < 128) {
22965     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22966     if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
22967       return SDValue();
22968     return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22969                        DAG.getBitcast(IntVT, MaskBits(V)),
22970                        DAG.getConstant(0, DL, IntVT));
22971   }
22972 
22973   // Quit if not splittable to 128/256-bit vector.
22974   if (!isPowerOf2_32(VT.getSizeInBits()))
22975     return SDValue();
22976 
22977   // Split down to 128/256-bit vector.
22978   unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
22979   while (VT.getSizeInBits() > TestSize) {
22980     auto Split = DAG.SplitVector(V, DL);
22981     VT = Split.first.getValueType();
22982     V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22983   }
22984 
22985   bool UsePTEST = Subtarget.hasSSE41();
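  // With SSE4.1 we can test the (masked) value directly with PTEST; otherwise
  // fall back to comparing every byte against zero with PCMPEQB, where the
  // input is all-zero iff MOVMSK of the compare result is exactly 0xFFFF.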
22986   if (UsePTEST) {
22987     MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
22988     V = DAG.getBitcast(TestVT, MaskBits(V));
22989     return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22990   }
22991 
22992   // Without PTEST, a masked v2i64 or-reduction is not faster than
22993   // scalarization.
22994   if (!Mask.isAllOnes() && VT.getScalarSizeInBits() > 32)
22995     return SDValue();
22996 
22997   V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
22998   V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
22999                   getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
23000   V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23001   return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23002                      DAG.getConstant(0xFFFF, DL, MVT::i32));
23003 }
23004 
// Check whether an OR'd reduction tree is PTEST-able, or if we can fall back
// to CMP(MOVMSK(PCMPEQB(X,0))).
23007 static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
23008                                       const SDLoc &DL,
23009                                       const X86Subtarget &Subtarget,
23010                                       SelectionDAG &DAG, SDValue &X86CC) {
23011   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23012 
23013   if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23014     return SDValue();
23015 
23016   // Check whether we're masking/truncating an OR-reduction result, in which
23017   // case track the masked bits.
23018   APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23019   switch (Op.getOpcode()) {
23020   case ISD::TRUNCATE: {
23021     SDValue Src = Op.getOperand(0);
23022     Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23023                                 Op.getScalarValueSizeInBits());
23024     Op = Src;
23025     break;
23026   }
23027   case ISD::AND: {
23028     if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23029       Mask = Cst->getAPIntValue();
23030       Op = Op.getOperand(0);
23031     }
23032     break;
23033   }
23034   }
23035 
23036   SmallVector<SDValue, 8> VecIns;
23037   if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
23038     EVT VT = VecIns[0].getValueType();
23039     assert(llvm::all_of(VecIns,
23040                         [VT](SDValue V) { return VT == V.getValueType(); }) &&
23041            "Reduction source vector mismatch");
23042 
23043     // Quit if less than 128-bits or not splittable to 128/256-bit vector.
23044     if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
23045       return SDValue();
23046 
23047     // If more than one full vector is evaluated, OR them first before PTEST.
23048     for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23049          Slot += 2, e += 1) {
23050       // Each iteration will OR 2 nodes and append the result until there is
23051       // only 1 node left, i.e. the final OR'd value of all vectors.
23052       SDValue LHS = VecIns[Slot];
23053       SDValue RHS = VecIns[Slot + 1];
23054       VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
23055     }
23056 
23057     X86::CondCode CCode;
23058     if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
23059                                        DAG, CCode)) {
23060       X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
23061       return V;
23062     }
23063   }
23064 
23065   if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23066     ISD::NodeType BinOp;
23067     if (SDValue Match =
23068             DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
23069       X86::CondCode CCode;
23070       if (SDValue V =
23071               LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
23072         X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
23073         return V;
23074       }
23075     }
23076   }
23077 
23078   return SDValue();
23079 }
23080 
/// Return true if \c Op has a use that doesn't just read flags.
23082 static bool hasNonFlagsUse(SDValue Op) {
23083   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
23084        ++UI) {
23085     SDNode *User = *UI;
23086     unsigned UOpNo = UI.getOperandNo();
23087     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
      // Look past the truncate.
23089       UOpNo = User->use_begin().getOperandNo();
23090       User = *User->use_begin();
23091     }
23092 
23093     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23094         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23095       return true;
23096   }
23097   return false;
23098 }
23099 
23100 // Transform to an x86-specific ALU node with flags if there is a chance of
23101 // using an RMW op or only the flags are used. Otherwise, leave
23102 // the node alone and emit a 'cmp' or 'test' instruction.
23103 static bool isProfitableToUseFlagOp(SDValue Op) {
23104   for (SDNode *U : Op->uses())
23105     if (U->getOpcode() != ISD::CopyToReg &&
23106         U->getOpcode() != ISD::SETCC &&
23107         U->getOpcode() != ISD::STORE)
23108       return false;
23109 
23110   return true;
23111 }
23112 
23113 /// Emit nodes that will be selected as "test Op0,Op0", or something
23114 /// equivalent.
23115 static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23116                         SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23117   // CF and OF aren't always set the way we want. Determine which
23118   // of these we need.
23119   bool NeedCF = false;
23120   bool NeedOF = false;
23121   switch (X86CC) {
23122   default: break;
23123   case X86::COND_A: case X86::COND_AE:
23124   case X86::COND_B: case X86::COND_BE:
23125     NeedCF = true;
23126     break;
23127   case X86::COND_G: case X86::COND_GE:
23128   case X86::COND_L: case X86::COND_LE:
23129   case X86::COND_O: case X86::COND_NO: {
    // Check if we really need to set the Overflow flag. If NoSignedWrap is
    // present, it is not actually needed.
23133     switch (Op->getOpcode()) {
23134     case ISD::ADD:
23135     case ISD::SUB:
23136     case ISD::MUL:
23137     case ISD::SHL:
23138       if (Op.getNode()->getFlags().hasNoSignedWrap())
23139         break;
23140       LLVM_FALLTHROUGH;
23141     default:
23142       NeedOF = true;
23143       break;
23144     }
23145     break;
23146   }
23147   }
23148   // See if we can use the EFLAGS value from the operand instead of
23149   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23150   // we prove that the arithmetic won't overflow, we can't use OF or CF.
23151   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23152     // Emit a CMP with 0, which is the TEST pattern.
23153     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23154                        DAG.getConstant(0, dl, Op.getValueType()));
23155   }
23156   unsigned Opcode = 0;
23157   unsigned NumOperands = 0;
23158 
23159   SDValue ArithOp = Op;
23160 
  // NOTE: In the code below we use ArithOp to hold the arithmetic operation,
  // which may be the result of a cast. When checking for possible users we
  // use 'Op', the non-casted value.
23164   switch (ArithOp.getOpcode()) {
23165   case ISD::AND:
23166     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23167     // because a TEST instruction will be better.
23168     if (!hasNonFlagsUse(Op))
23169       break;
23170 
23171     LLVM_FALLTHROUGH;
23172   case ISD::ADD:
23173   case ISD::SUB:
23174   case ISD::OR:
23175   case ISD::XOR:
23176     if (!isProfitableToUseFlagOp(Op))
23177       break;
23178 
23179     // Otherwise use a regular EFLAGS-setting instruction.
23180     switch (ArithOp.getOpcode()) {
23181     default: llvm_unreachable("unexpected operator!");
23182     case ISD::ADD: Opcode = X86ISD::ADD; break;
23183     case ISD::SUB: Opcode = X86ISD::SUB; break;
23184     case ISD::XOR: Opcode = X86ISD::XOR; break;
23185     case ISD::AND: Opcode = X86ISD::AND; break;
23186     case ISD::OR:  Opcode = X86ISD::OR;  break;
23187     }
23188 
23189     NumOperands = 2;
23190     break;
23191   case X86ISD::ADD:
23192   case X86ISD::SUB:
23193   case X86ISD::OR:
23194   case X86ISD::XOR:
23195   case X86ISD::AND:
23196     return SDValue(Op.getNode(), 1);
23197   case ISD::SSUBO:
23198   case ISD::USUBO: {
    // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
23200     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23201     return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23202                        Op->getOperand(1)).getValue(1);
23203   }
23204   default:
23205     break;
23206   }
23207 
23208   if (Opcode == 0) {
23209     // Emit a CMP with 0, which is the TEST pattern.
23210     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23211                        DAG.getConstant(0, dl, Op.getValueType()));
23212   }
23213   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23214   SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
23215 
23216   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23217   DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23218   return SDValue(New.getNode(), 1);
23219 }
23220 
23221 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
23222 /// equivalent.
23223 static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23224                        const SDLoc &dl, SelectionDAG &DAG,
23225                        const X86Subtarget &Subtarget) {
23226   if (isNullConstant(Op1))
23227     return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23228 
23229   EVT CmpVT = Op0.getValueType();
23230 
23231   assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23232           CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23233 
  // Only promote the compare up to i32 if it is a 16-bit operation with an
  // immediate; 16-bit immediates are to be avoided.
23236   if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
23237       !DAG.getMachineFunction().getFunction().hasMinSize()) {
23238     ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
23239     ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
23240     // Don't do this if the immediate can fit in 8-bits.
23241     if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23242         (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23243       unsigned ExtendOp =
23244           isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23245       if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
        // For equality comparisons try to use SIGN_EXTEND if the input was
        // truncated from something with enough sign bits.
23248         if (Op0.getOpcode() == ISD::TRUNCATE) {
23249           if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23250             ExtendOp = ISD::SIGN_EXTEND;
23251         } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23252           if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23253             ExtendOp = ISD::SIGN_EXTEND;
23254         }
23255       }
23256 
23257       CmpVT = MVT::i32;
23258       Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23259       Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23260     }
23261   }
23262 
23263   // Try to shrink i64 compares if the input has enough zero bits.
23264   // FIXME: Do this for non-constant compares for constant on LHS?
23265   if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
23266       Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23267       cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
23268       DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23269     CmpVT = MVT::i32;
23270     Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23271     Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23272   }
23273 
23274   // 0-x == y --> x+y == 0
23275   // 0-x != y --> x+y != 0
23276   if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23277       Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23278     SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23279     SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23280     return Add.getValue(1);
23281   }
23282 
23283   // x == 0-y --> x+y == 0
23284   // x != 0-y --> x+y != 0
23285   if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23286       Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23287     SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23288     SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23289     return Add.getValue(1);
23290   }
23291 
23292   // Use SUB instead of CMP to enable CSE between SUB and CMP.
23293   SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23294   SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
23295   return Sub.getValue(1);
23296 }
23297 
23298 /// Check if replacement of SQRT with RSQRT should be disabled.
23299 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23300   EVT VT = Op.getValueType();
23301 
23302   // We don't need to replace SQRT with RSQRT for half type.
23303   if (VT.getScalarType() == MVT::f16)
23304     return true;
23305 
23306   // We never want to use both SQRT and RSQRT instructions for the same input.
23307   if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23308     return false;
23309 
23310   if (VT.isVector())
23311     return Subtarget.hasFastVectorFSQRT();
23312   return Subtarget.hasFastScalarFSQRT();
23313 }
23314 
23315 /// The minimum architected relative accuracy is 2^-12. We need one
23316 /// Newton-Raphson step to have a good float result (24 bits of precision).
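/// The refinement itself is performed by the caller using the standard
/// Newton-Raphson iteration Est' = Est * (1.5 - 0.5 * Op * Est * Est), each
/// step of which roughly doubles the number of accurate bits.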
23317 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23318                                            SelectionDAG &DAG, int Enabled,
23319                                            int &RefinementSteps,
23320                                            bool &UseOneConstNR,
23321                                            bool Reciprocal) const {
23322   SDLoc DL(Op);
23323   EVT VT = Op.getValueType();
23324 
23325   // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23326   // It is likely not profitable to do this for f64 because a double-precision
23327   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23328   // instructions: convert to single, rsqrtss, convert back to double, refine
23329   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23330   // along with FMA, this could be a throughput win.
23331   // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23332   // after legalize types.
23333   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23334       (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23335       (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23336       (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23337       (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23338     if (RefinementSteps == ReciprocalEstimate::Unspecified)
23339       RefinementSteps = 1;
23340 
23341     UseOneConstNR = false;
    // There is no 512-bit FRSQRT, but AVX-512 provides RSQRT14.
23343     unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23344     SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23345     if (RefinementSteps == 0 && !Reciprocal)
23346       Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23347     return Estimate;
23348   }
23349 
23350   if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23351       Subtarget.hasFP16()) {
23352     assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23353     if (RefinementSteps == ReciprocalEstimate::Unspecified)
23354       RefinementSteps = 0;
23355 
23356     if (VT == MVT::f16) {
23357       SDValue Zero = DAG.getIntPtrConstant(0, DL);
23358       SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23359       Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23360       Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23361       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23362     }
23363 
23364     return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23365   }
23366   return SDValue();
23367 }
23368 
23369 /// The minimum architected relative accuracy is 2^-12. We need one
23370 /// Newton-Raphson step to have a good float result (24 bits of precision).
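/// The refinement itself is performed by the caller using the standard
/// Newton-Raphson iteration for the reciprocal: Est' = Est * (2.0 - Op * Est).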
23371 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23372                                             int Enabled,
23373                                             int &RefinementSteps) const {
23374   SDLoc DL(Op);
23375   EVT VT = Op.getValueType();
23376 
23377   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23378   // It is likely not profitable to do this for f64 because a double-precision
23379   // reciprocal estimate with refinement on x86 prior to FMA requires
23380   // 15 instructions: convert to single, rcpss, convert back to double, refine
23381   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23382   // along with FMA, this could be a throughput win.
23383 
23384   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23385       (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23386       (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23387       (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23388     // Enable estimate codegen with 1 refinement step for vector division.
23389     // Scalar division estimates are disabled because they break too much
23390     // real-world code. These defaults are intended to match GCC behavior.
23391     if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23392       return SDValue();
23393 
23394     if (RefinementSteps == ReciprocalEstimate::Unspecified)
23395       RefinementSteps = 1;
23396 
    // There is no 512-bit FRCP, but AVX-512 provides RCP14.
23398     unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23399     return DAG.getNode(Opcode, DL, VT, Op);
23400   }
23401 
23402   if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23403       Subtarget.hasFP16()) {
23404     if (RefinementSteps == ReciprocalEstimate::Unspecified)
23405       RefinementSteps = 0;
23406 
23407     if (VT == MVT::f16) {
23408       SDValue Zero = DAG.getIntPtrConstant(0, DL);
23409       SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23410       Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23411       Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23412       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23413     }
23414 
23415     return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23416   }
23417   return SDValue();
23418 }
23419 
23420 /// If we have at least two divisions that use the same divisor, convert to
23421 /// multiplication by a reciprocal. This may need to be adjusted for a given
23422 /// CPU if a division's cost is not at least twice the cost of a multiplication.
23423 /// This is because we still need one division to calculate the reciprocal and
23424 /// then we need two multiplies by that reciprocal as replacements for the
23425 /// original divisions.
23426 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23427   return 2;
23428 }
23429 
23430 SDValue
23431 X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23432                                  SelectionDAG &DAG,
23433                                  SmallVectorImpl<SDNode *> &Created) const {
23434   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23435   if (isIntDivCheap(N->getValueType(0), Attr))
23436     return SDValue(N,0); // Lower SDIV as SDIV
23437 
23438   assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23439          "Unexpected divisor!");
23440 
23441   // Only perform this transform if CMOV is supported otherwise the select
23442   // below will become a branch.
23443   if (!Subtarget.hasCMov())
23444     return SDValue();
23445 
23446   // fold (sdiv X, pow2)
23447   EVT VT = N->getValueType(0);
23448   // FIXME: Support i8.
23449   if (VT != MVT::i16 && VT != MVT::i32 &&
23450       !(Subtarget.is64Bit() && VT == MVT::i64))
23451     return SDValue();
23452 
23453   unsigned Lg2 = Divisor.countTrailingZeros();
23454 
23455   // If the divisor is 2 or -2, the default expansion is better.
23456   if (Lg2 == 1)
23457     return SDValue();
23458 
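  // For example, for a divisor of 8 (Lg2 == 3) this emits:
  //   add  = N0 + 7
  //   cmov = (N0 < 0) ? add : N0
  //   res  = cmov >> 3   (arithmetic shift)
  // and the result is negated afterwards if the divisor was -8.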
23459   SDLoc DL(N);
23460   SDValue N0 = N->getOperand(0);
23461   SDValue Zero = DAG.getConstant(0, DL, VT);
23462   APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
23463   SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
23464 
23465   // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
23466   SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
23467   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
23468   SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
23469 
23470   Created.push_back(Cmp.getNode());
23471   Created.push_back(Add.getNode());
23472   Created.push_back(CMov.getNode());
23473 
23474   // Divide by pow2.
23475   SDValue SRA =
23476       DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
23477 
23478   // If we're dividing by a positive value, we're done.  Otherwise, we must
23479   // negate the result.
23480   if (Divisor.isNonNegative())
23481     return SRA;
23482 
23483   Created.push_back(SRA.getNode());
23484   return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
23485 }
23486 
23487 /// Result of 'and' is compared against zero. Change to a BT node if possible.
23488 /// Returns the BT node and the condition code needed to use it.
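/// For example, (and X, (shl 1, N)) compared against zero becomes
/// (X86ISD::BT X, N), read through condition AE for SETEQ and B for SETNE.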
23489 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
23490                             const SDLoc &dl, SelectionDAG &DAG,
23491                             SDValue &X86CC) {
23492   assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23493   SDValue Op0 = And.getOperand(0);
23494   SDValue Op1 = And.getOperand(1);
23495   if (Op0.getOpcode() == ISD::TRUNCATE)
23496     Op0 = Op0.getOperand(0);
23497   if (Op1.getOpcode() == ISD::TRUNCATE)
23498     Op1 = Op1.getOperand(0);
23499 
23500   SDValue Src, BitNo;
23501   if (Op1.getOpcode() == ISD::SHL)
23502     std::swap(Op0, Op1);
23503   if (Op0.getOpcode() == ISD::SHL) {
23504     if (isOneConstant(Op0.getOperand(0))) {
23505       // If we looked past a truncate, check that it's only truncating away
23506       // known zeros.
23507       unsigned BitWidth = Op0.getValueSizeInBits();
23508       unsigned AndBitWidth = And.getValueSizeInBits();
23509       if (BitWidth > AndBitWidth) {
23510         KnownBits Known = DAG.computeKnownBits(Op0);
23511         if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23512           return SDValue();
23513       }
23514       Src = Op1;
23515       BitNo = Op0.getOperand(1);
23516     }
23517   } else if (Op1.getOpcode() == ISD::Constant) {
23518     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23519     uint64_t AndRHSVal = AndRHS->getZExtValue();
23520     SDValue AndLHS = Op0;
23521 
23522     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23523       Src = AndLHS.getOperand(0);
23524       BitNo = AndLHS.getOperand(1);
23525     } else {
      // Use BT if the immediate can't be encoded in a TEST instruction or we
      // are optimizing for size and the immediate won't fit in a byte.
23528       bool OptForSize = DAG.shouldOptForSize();
23529       if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23530           isPowerOf2_64(AndRHSVal)) {
23531         Src = AndLHS;
23532         BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23533                                 Src.getValueType());
23534       }
23535     }
23536   }
23537 
23538   // No patterns found, give up.
23539   if (!Src.getNode())
23540     return SDValue();
23541 
23542   // If Src is i8, promote it to i32 with any_extend.  There is no i8 BT
23543   // instruction.  Since the shift amount is in-range-or-undefined, we know
23544   // that doing a bittest on the i32 value is ok.  We extend to i32 because
23545   // the encoding for the i16 version is larger than the i32 version.
  // Also promote i16 to i32 for performance / code size reasons.
23547   if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
23548     Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
23549 
23550   // See if we can use the 32-bit instruction instead of the 64-bit one for a
23551   // shorter encoding. Since the former takes the modulo 32 of BitNo and the
23552   // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
23553   // known to be zero.
23554   if (Src.getValueType() == MVT::i64 &&
23555       DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
23556     Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
23557 
23558   // If the operand types disagree, extend the shift amount to match.  Since
23559   // BT ignores high bits (like shifts) we can use anyextend.
23560   if (Src.getValueType() != BitNo.getValueType())
23561     BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
23562 
23563   X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
23564                                 dl, MVT::i8);
23565   return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
23566 }
23567 
23568 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23569 /// CMPs.
23570 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23571                                    SDValue &Op1, bool &IsAlwaysSignaling) {
23572   unsigned SSECC;
23573   bool Swap = false;
23574 
23575   // SSE Condition code mapping:
23576   //  0 - EQ
23577   //  1 - LT
23578   //  2 - LE
23579   //  3 - UNORD
23580   //  4 - NEQ
23581   //  5 - NLT
23582   //  6 - NLE
23583   //  7 - ORD
23584   switch (SetCCOpcode) {
23585   default: llvm_unreachable("Unexpected SETCC condition");
23586   case ISD::SETOEQ:
23587   case ISD::SETEQ:  SSECC = 0; break;
23588   case ISD::SETOGT:
23589   case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
23590   case ISD::SETLT:
23591   case ISD::SETOLT: SSECC = 1; break;
23592   case ISD::SETOGE:
23593   case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
23594   case ISD::SETLE:
23595   case ISD::SETOLE: SSECC = 2; break;
23596   case ISD::SETUO:  SSECC = 3; break;
23597   case ISD::SETUNE:
23598   case ISD::SETNE:  SSECC = 4; break;
23599   case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
23600   case ISD::SETUGE: SSECC = 5; break;
23601   case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
23602   case ISD::SETUGT: SSECC = 6; break;
23603   case ISD::SETO:   SSECC = 7; break;
23604   case ISD::SETUEQ: SSECC = 8; break;
23605   case ISD::SETONE: SSECC = 12; break;
23606   }
23607   if (Swap)
23608     std::swap(Op0, Op1);
23609 
23610   switch (SetCCOpcode) {
23611   default:
23612     IsAlwaysSignaling = true;
23613     break;
23614   case ISD::SETEQ:
23615   case ISD::SETOEQ:
23616   case ISD::SETUEQ:
23617   case ISD::SETNE:
23618   case ISD::SETONE:
23619   case ISD::SETUNE:
23620   case ISD::SETO:
23621   case ISD::SETUO:
23622     IsAlwaysSignaling = false;
23623     break;
23624   }
23625 
23626   return SSECC;
23627 }
23628 
/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
/// concatenate the result back.
23631 static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
23632                               ISD::CondCode Cond, SelectionDAG &DAG,
23633                               const SDLoc &dl) {
23634   assert(VT.isInteger() && VT == LHS.getValueType() &&
23635          VT == RHS.getValueType() && "Unsupported VTs!");
23636 
23637   SDValue CC = DAG.getCondCode(Cond);
23638 
23639   // Extract the LHS Lo/Hi vectors
23640   SDValue LHS1, LHS2;
23641   std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23642 
23643   // Extract the RHS Lo/Hi vectors
23644   SDValue RHS1, RHS2;
23645   std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23646 
23647   // Issue the operation on the smaller types and concatenate the result back
23648   EVT LoVT, HiVT;
23649   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23650   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23651                      DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23652                      DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23653 }
23654 
23655 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
23656 
23657   SDValue Op0 = Op.getOperand(0);
23658   SDValue Op1 = Op.getOperand(1);
23659   SDValue CC = Op.getOperand(2);
23660   MVT VT = Op.getSimpleValueType();
23661   SDLoc dl(Op);
23662 
23663   assert(VT.getVectorElementType() == MVT::i1 &&
23664          "Cannot set masked compare for this operation");
23665 
23666   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23667 
23668   // Prefer SETGT over SETLT.
23669   if (SetCCOpcode == ISD::SETLT) {
23670     SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23671     std::swap(Op0, Op1);
23672   }
23673 
23674   return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23675 }
23676 
23677 /// Given a buildvector constant, return a new vector constant with each element
23678 /// incremented or decremented. If incrementing or decrementing would result in
23679 /// unsigned overflow or underflow or this is not a simple vector constant,
23680 /// return an empty value.
23681 static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
23682   auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23683   if (!BV)
23684     return SDValue();
23685 
23686   MVT VT = V.getSimpleValueType();
23687   MVT EltVT = VT.getVectorElementType();
23688   unsigned NumElts = VT.getVectorNumElements();
23689   SmallVector<SDValue, 8> NewVecC;
23690   SDLoc DL(V);
23691   for (unsigned i = 0; i < NumElts; ++i) {
23692     auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23693     if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23694       return SDValue();
23695 
23696     // Avoid overflow/underflow.
23697     const APInt &EltC = Elt->getAPIntValue();
23698     if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23699       return SDValue();
23700 
23701     NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23702   }
23703 
23704   return DAG.getBuildVector(VT, DL, NewVecC);
23705 }
23706 
23707 /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23708 /// Op0 u<= Op1:
23709 ///   t = psubus Op0, Op1
23710 ///   pcmpeq t, <0..0>
23711 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23712                                     ISD::CondCode Cond, const SDLoc &dl,
23713                                     const X86Subtarget &Subtarget,
23714                                     SelectionDAG &DAG) {
23715   if (!Subtarget.hasSSE2())
23716     return SDValue();
23717 
23718   MVT VET = VT.getVectorElementType();
23719   if (VET != MVT::i8 && VET != MVT::i16)
23720     return SDValue();
23721 
23722   switch (Cond) {
23723   default:
23724     return SDValue();
23725   case ISD::SETULT: {
23726     // If the comparison is against a constant we can turn this into a
    // If the comparison is against a constant we can turn this into a
    // setule. With psubus, setule does not require a swap. This is
    // beneficial because the constant in the register is no longer
    // clobbered as the destination, so it can be hoisted out of a loop.
23731     if (Subtarget.hasAVX())
23732       return SDValue();
23733     SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
23734     if (!ULEOp1)
23735       return SDValue();
23736     Op1 = ULEOp1;
23737     break;
23738   }
23739   case ISD::SETUGT: {
23740     // If the comparison is against a constant, we can turn this into a setuge.
23741     // This is beneficial because materializing a constant 0 for the PCMPEQ is
23742     // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23743     // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23744     SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
23745     if (!UGEOp1)
23746       return SDValue();
23747     Op1 = Op0;
23748     Op0 = UGEOp1;
23749     break;
23750   }
23751   // Psubus is better than flip-sign because it requires no inversion.
23752   case ISD::SETUGE:
23753     std::swap(Op0, Op1);
23754     break;
23755   case ISD::SETULE:
23756     break;
23757   }
23758 
23759   SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23760   return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23761                      DAG.getConstant(0, dl, VT));
23762 }
23763 
23764 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23765                            SelectionDAG &DAG) {
23766   bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23767                   Op.getOpcode() == ISD::STRICT_FSETCCS;
23768   SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23769   SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23770   SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23771   MVT VT = Op->getSimpleValueType(0);
23772   ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23773   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
23774   SDLoc dl(Op);
23775 
23776   if (isFP) {
23777 #ifndef NDEBUG
23778     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
23779     assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
23780 #endif
23781 
23782     bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23783     SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23784 
23785     // If we have a strict compare with a vXi1 result and the input is 128/256
23786     // bits we can't use a masked compare unless we have VLX. If we use a wider
23787     // compare like we do for non-strict, we might trigger spurious exceptions
    // from the upper elements. Instead emit an AVX compare and convert to mask.
23789     unsigned Opc;
23790     if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23791         (!IsStrict || Subtarget.hasVLX() ||
23792          Op0.getSimpleValueType().is512BitVector())) {
23793 #ifndef NDEBUG
23794       unsigned Num = VT.getVectorNumElements();
23795       assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
23796 #endif
23797       Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23798     } else {
23799       Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23800       // The SSE/AVX packed FP comparison nodes are defined with a
23801       // floating-point vector result that matches the operand type. This allows
23802       // them to work with an SSE1 target (integer vector types are not legal).
23803       VT = Op0.getSimpleValueType();
23804     }
23805 
23806     SDValue Cmp;
23807     bool IsAlwaysSignaling;
23808     unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23809     if (!Subtarget.hasAVX()) {
      // TODO: We could use the following steps to handle a quiet compare with
      // signaling encodings.
      // 1. Get ordered masks from a quiet ISD::SETO
      // 2. Use the masks to mask potential unordered elements in operands A, B
      // 3. Get the compare results of the masked A, B
      // 4. Calculate the final result using the mask and the result from 3
      // But currently, we just fall back to scalar operations.
23817       if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23818         return SDValue();
23819 
23820       // Insert an extra signaling instruction to raise exception.
23821       if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23822         SDValue SignalCmp = DAG.getNode(
23823             Opc, dl, {VT, MVT::Other},
23824             {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23825         // FIXME: It seems we need to update the flags of all new strict nodes.
23826         // Otherwise, mayRaiseFPException in MI will return false due to
23827         // NoFPExcept = false by default. However, I didn't find it in other
23828         // patches.
23829         SignalCmp->setFlags(Op->getFlags());
23830         Chain = SignalCmp.getValue(1);
23831       }
23832 
23833       // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23834       // emit two comparisons and a logic op to tie them together.
23835       if (SSECC >= 8) {
23836         // LLVM predicate is SETUEQ or SETONE.
23837         unsigned CC0, CC1;
23838         unsigned CombineOpc;
23839         if (Cond == ISD::SETUEQ) {
23840           CC0 = 3; // UNORD
23841           CC1 = 0; // EQ
23842           CombineOpc = X86ISD::FOR;
23843         } else {
23844           assert(Cond == ISD::SETONE);
23845           CC0 = 7; // ORD
23846           CC1 = 4; // NEQ
23847           CombineOpc = X86ISD::FAND;
23848         }
23849 
23850         SDValue Cmp0, Cmp1;
23851         if (IsStrict) {
23852           Cmp0 = DAG.getNode(
23853               Opc, dl, {VT, MVT::Other},
23854               {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23855           Cmp1 = DAG.getNode(
23856               Opc, dl, {VT, MVT::Other},
23857               {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23858           Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23859                               Cmp1.getValue(1));
23860         } else {
23861           Cmp0 = DAG.getNode(
23862               Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23863           Cmp1 = DAG.getNode(
23864               Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23865         }
23866         Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23867       } else {
23868         if (IsStrict) {
23869           Cmp = DAG.getNode(
23870               Opc, dl, {VT, MVT::Other},
23871               {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23872           Chain = Cmp.getValue(1);
23873         } else
23874           Cmp = DAG.getNode(
23875               Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23876       }
23877     } else {
23878       // Handle all other FP comparisons here.
23879       if (IsStrict) {
        // Bit 4 of the AVX CC toggles quiet vs. signaling; set it when the
        // predicate's default behavior doesn't match the requested semantics.
23881         SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23882         Cmp = DAG.getNode(
23883             Opc, dl, {VT, MVT::Other},
23884             {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23885         Chain = Cmp.getValue(1);
23886       } else
23887         Cmp = DAG.getNode(
23888             Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23889     }
23890 
23891     if (VT.getFixedSizeInBits() >
23892         Op.getSimpleValueType().getFixedSizeInBits()) {
23893       // We emitted a compare with an XMM/YMM result. Finish converting to a
23894       // mask register using a vptestm.
23895       EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23896       Cmp = DAG.getBitcast(CastVT, Cmp);
23897       Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23898                          DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23899     } else {
23900       // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23901       // the result type of SETCC. The bitcast is expected to be optimized
23902       // away during combining/isel.
23903       Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23904     }
23905 
23906     if (IsStrict)
23907       return DAG.getMergeValues({Cmp, Chain}, dl);
23908 
23909     return Cmp;
23910   }
23911 
23912   assert(!IsStrict && "Strict SETCC only handles FP operands.");
23913 
23914   MVT VTOp0 = Op0.getSimpleValueType();
23915   (void)VTOp0;
23916   assert(VTOp0 == Op1.getSimpleValueType() &&
23917          "Expected operands with same type!");
23918   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
23919          "Invalid number of packed elements for source and destination!");
23920 
23921   // The non-AVX512 code below works under the assumption that source and
23922   // destination types are the same.
23923   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23924          "Value types for source and destination must be the same!");
23925 
23926   // The result is boolean, but operands are int/float
23927   if (VT.getVectorElementType() == MVT::i1) {
    // In the AVX-512 architecture setcc returns a mask with i1 elements,
    // but there is no compare instruction for i8 and i16 elements in KNL.
23930     assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23931            "Unexpected operand type");
23932     return LowerIntVSETCC_AVX512(Op, DAG);
23933   }
23934 
23935   // Lower using XOP integer comparisons.
23936   if (VT.is128BitVector() && Subtarget.hasXOP()) {
23937     // Translate compare code to XOP PCOM compare mode.
23938     unsigned CmpMode = 0;
23939     switch (Cond) {
23940     default: llvm_unreachable("Unexpected SETCC condition");
23941     case ISD::SETULT:
23942     case ISD::SETLT: CmpMode = 0x00; break;
23943     case ISD::SETULE:
23944     case ISD::SETLE: CmpMode = 0x01; break;
23945     case ISD::SETUGT:
23946     case ISD::SETGT: CmpMode = 0x02; break;
23947     case ISD::SETUGE:
23948     case ISD::SETGE: CmpMode = 0x03; break;
23949     case ISD::SETEQ: CmpMode = 0x04; break;
23950     case ISD::SETNE: CmpMode = 0x05; break;
23951     }
23952 
23953     // Are we comparing unsigned or signed integers?
23954     unsigned Opc =
23955         ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23956 
23957     return DAG.getNode(Opc, dl, VT, Op0, Op1,
23958                        DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23959   }
23960 
23961   // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23962   // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23963   if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23964     SDValue BC0 = peekThroughBitcasts(Op0);
23965     if (BC0.getOpcode() == ISD::AND) {
23966       APInt UndefElts;
23967       SmallVector<APInt, 64> EltBits;
23968       if (getTargetConstantBitsFromNode(BC0.getOperand(1),
23969                                         VT.getScalarSizeInBits(), UndefElts,
23970                                         EltBits, false, false)) {
23971         if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23972           Cond = ISD::SETEQ;
23973           Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23974         }
23975       }
23976     }
23977   }
23978 
23979   // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23980   if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23981       Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23982     ConstantSDNode *C1 = isConstOrConstSplat(Op1);
23983     if (C1 && C1->getAPIntValue().isPowerOf2()) {
23984       unsigned BitWidth = VT.getScalarSizeInBits();
23985       unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23986 
23987       SDValue Result = Op0.getOperand(0);
23988       Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23989                            DAG.getConstant(ShiftAmt, dl, VT));
23990       Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23991                            DAG.getConstant(BitWidth - 1, dl, VT));
23992       return Result;
23993     }
23994   }
23995 
23996   // Break 256-bit integer vector compare into smaller ones.
23997   if (VT.is256BitVector() && !Subtarget.hasInt256())
23998     return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23999 
24000   if (VT == MVT::v32i16 || VT == MVT::v64i8) {
24001     assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
24002     return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24003   }
24004 
24005   // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24006   // not-of-PCMPEQ:
24007   // X != INT_MIN --> X >s INT_MIN
24008   // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24009   // +X != 0 --> +X >s 0
24010   APInt ConstValue;
24011   if (Cond == ISD::SETNE &&
24012       ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24013     if (ConstValue.isMinSignedValue())
24014       Cond = ISD::SETGT;
24015     else if (ConstValue.isMaxSignedValue())
24016       Cond = ISD::SETLT;
24017     else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24018       Cond = ISD::SETGT;
24019   }
24020 
24021   // If both operands are known non-negative, then an unsigned compare is the
24022   // same as a signed compare and there's no need to flip signbits.
24023   // TODO: We could check for more general simplifications here since we're
24024   // computing known bits.
24025   bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24026                    !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24027 
24028   // Special case: Use min/max operations for unsigned compares.
24029   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24030   if (ISD::isUnsignedIntSetCC(Cond) &&
24031       (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24032       TLI.isOperationLegal(ISD::UMIN, VT)) {
24033     // If we have a constant operand, increment/decrement it and change the
24034     // condition to avoid an invert.
24035     if (Cond == ISD::SETUGT) {
24036       // X > C --> X >= (C+1) --> X == umax(X, C+1)
24037       if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
24038         Op1 = UGTOp1;
24039         Cond = ISD::SETUGE;
24040       }
24041     }
24042     if (Cond == ISD::SETULT) {
24043       // X < C --> X <= (C-1) --> X == umin(X, C-1)
24044       if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
24045         Op1 = ULTOp1;
24046         Cond = ISD::SETULE;
24047       }
24048     }
24049     bool Invert = false;
24050     unsigned Opc;
24051     switch (Cond) {
24052     default: llvm_unreachable("Unexpected condition code");
24053     case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
24054     case ISD::SETULE: Opc = ISD::UMIN; break;
24055     case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
24056     case ISD::SETUGE: Opc = ISD::UMAX; break;
24057     }
24058 
24059     SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24060     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24061 
24062     // If the logical-not of the result is required, perform that now.
24063     if (Invert)
24064       Result = DAG.getNOT(dl, Result, VT);
24065 
24066     return Result;
24067   }
24068 
24069   // Try to use SUBUS and PCMPEQ.
24070   if (FlipSigns)
24071     if (SDValue V =
24072             LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24073       return V;
24074 
  // We are handling one of the integer comparisons here. Since SSE only has
  // GT and EQ comparisons for integers, swapping operands and multiple
  // operations may be required for some comparisons.
24078   unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24079                                                             : X86ISD::PCMPGT;
24080   bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24081               Cond == ISD::SETGE || Cond == ISD::SETUGE;
24082   bool Invert = Cond == ISD::SETNE ||
24083                 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24084 
24085   if (Swap)
24086     std::swap(Op0, Op1);
24087 
24088   // Check that the operation in question is available (most are plain SSE2,
24089   // but PCMPGTQ and PCMPEQQ have different requirements).
24090   if (VT == MVT::v2i64) {
24091     if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24092       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24093 
24094       // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24095       // the odd elements over the even elements.
24096       if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24097         Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24098         Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24099 
24100         SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24101         static const int MaskHi[] = { 1, 1, 3, 3 };
24102         SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24103 
24104         return DAG.getBitcast(VT, Result);
24105       }
24106 
24107       if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24108         Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24109         Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
24110 
24111         SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24112         static const int MaskHi[] = { 1, 1, 3, 3 };
24113         SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24114 
24115         return DAG.getBitcast(VT, Result);
24116       }
24117 
24118       // Since SSE has no unsigned integer comparisons, we need to flip the sign
24119       // bits of the inputs before performing those operations. The lower
24120       // compare is always unsigned.
24121       SDValue SB;
24122       if (FlipSigns) {
24123         SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
24124       } else {
24125         SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
24126       }
24127       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24128       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24129 
24130       // Cast everything to the right type.
24131       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24132       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24133 
24134       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
24135       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24136       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24137 
24138       // Create masks for only the low parts/high parts of the 64 bit integers.
24139       static const int MaskHi[] = { 1, 1, 3, 3 };
24140       static const int MaskLo[] = { 0, 0, 2, 2 };
24141       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24142       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24143       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24144 
24145       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24146       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24147 
24148       if (Invert)
24149         Result = DAG.getNOT(dl, Result, MVT::v4i32);
24150 
24151       return DAG.getBitcast(VT, Result);
24152     }
24153 
24154     if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq
      // with pcmpeqd + pshufd + pand.
24157       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24158 
24159       // First cast everything to the right type.
24160       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24161       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24162 
24163       // Do the compare.
24164       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24165 
24166       // Make sure the lower and upper halves are both all-ones.
24167       static const int Mask[] = { 1, 0, 3, 2 };
24168       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24169       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24170 
24171       if (Invert)
24172         Result = DAG.getNOT(dl, Result, MVT::v4i32);
24173 
24174       return DAG.getBitcast(VT, Result);
24175     }
24176   }
24177 
24178   // Since SSE has no unsigned integer comparisons, we need to flip the sign
24179   // bits of the inputs before performing those operations.
24180   if (FlipSigns) {
24181     MVT EltVT = VT.getVectorElementType();
24182     SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24183                                  VT);
24184     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24185     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24186   }
24187 
24188   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24189 
24190   // If the logical-not of the result is required, perform that now.
24191   if (Invert)
24192     Result = DAG.getNOT(dl, Result, VT);
24193 
24194   return Result;
24195 }
24196 
24197 // Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
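// For example, (setcc (bitcast (v16i1 K) to i16), 0, setne) becomes
// (KORTEST K, K), read through the NE condition code.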
24198 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24199                               const SDLoc &dl, SelectionDAG &DAG,
24200                               const X86Subtarget &Subtarget,
24201                               SDValue &X86CC) {
24202   // Only support equality comparisons.
24203   if (CC != ISD::SETEQ && CC != ISD::SETNE)
24204     return SDValue();
24205 
24206   // Must be a bitcast from vXi1.
24207   if (Op0.getOpcode() != ISD::BITCAST)
24208     return SDValue();
24209 
24210   Op0 = Op0.getOperand(0);
24211   MVT VT = Op0.getSimpleValueType();
24212   if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24213       !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24214       !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24215     return SDValue();
24216 
24217   X86::CondCode X86Cond;
24218   if (isNullConstant(Op1)) {
24219     X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24220   } else if (isAllOnesConstant(Op1)) {
24221     // C flag is set for all ones.
24222     X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24223   } else
24224     return SDValue();
24225 
  // If the input is an AND, we can combine its operands into the KTEST.
24227   bool KTestable = false;
24228   if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24229     KTestable = true;
24230   if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24231     KTestable = true;
24232   if (!isNullConstant(Op1))
24233     KTestable = false;
24234   if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24235     SDValue LHS = Op0.getOperand(0);
24236     SDValue RHS = Op0.getOperand(1);
24237     X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24238     return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24239   }
24240 
  // If the input is an OR, we can combine its operands into the KORTEST.
24242   SDValue LHS = Op0;
24243   SDValue RHS = Op0;
24244   if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24245     LHS = Op0.getOperand(0);
24246     RHS = Op0.getOperand(1);
24247   }
24248 
24249   X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24250   return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24251 }
24252 
24253 /// Emit flags for the given setcc condition and operands. Also returns the
24254 /// corresponding X86 condition code constant in X86CC.
24255 SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24256                                              ISD::CondCode CC, const SDLoc &dl,
24257                                              SelectionDAG &DAG,
24258                                              SDValue &X86CC) const {
24259   // Optimize to BT if possible.
24260   // Lower (X & (1 << N)) == 0 to BT(X, N).
24261   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24262   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24263   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
24264       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24265     if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
24266       return BT;
24267   }
24268 
24269   // Try to use PTEST/PMOVMSKB for a tree ORs equality compared with 0.
24270   // TODO: We could do AND tree with all 1s as well by using the C flag.
24271   if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
24272     if (SDValue CmpZ =
24273             MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
24274       return CmpZ;
24275 
24276   // Try to lower using KORTEST or KTEST.
24277   if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24278     return Test;
24279 
24280   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
24281   // these.
24282   if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
24283       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24284     // If the input is a setcc, then reuse the input setcc or use a new one with
24285     // the inverted condition.
24286     if (Op0.getOpcode() == X86ISD::SETCC) {
24287       bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24288 
24289       X86CC = Op0.getOperand(0);
24290       if (Invert) {
24291         X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24292         CCode = X86::GetOppositeBranchCondition(CCode);
24293         X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
24294       }
24295 
24296       return Op0.getOperand(1);
24297     }
24298   }
24299 
  // Try to use the carry flag from the add in place of a separate CMP for:
24301   // (seteq (add X, -1), -1). Similar for setne.
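  // Adding -1 produces an unsigned carry exactly when X is non-zero, so
  // CF==0 (COND_AE) corresponds to SETEQ and CF==1 (COND_B) to SETNE.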
24302   if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24303       Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24304     if (isProfitableToUseFlagOp(Op0)) {
24305       SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24306 
24307       SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24308                                 Op0.getOperand(1));
24309       DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24310       X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24311       X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
24312       return SDValue(New.getNode(), 1);
24313     }
24314   }
24315 
24316   X86::CondCode CondCode =
24317       TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24318   assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24319 
24320   SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24321   X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24322   return EFLAGS;
24323 }
24324 
24325 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24326 
24327   bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24328                   Op.getOpcode() == ISD::STRICT_FSETCCS;
24329   MVT VT = Op->getSimpleValueType(0);
24330 
24331   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24332 
24333   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24334   SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24335   SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24336   SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24337   SDLoc dl(Op);
24338   ISD::CondCode CC =
24339       cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24340 
24341   // Handle f128 first, since one possible outcome is a normal integer
24342   // comparison which gets handled by emitFlagsForSetcc.
24343   if (Op0.getValueType() == MVT::f128) {
24344     softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24345                         Op.getOpcode() == ISD::STRICT_FSETCCS);
24346 
24347     // If softenSetCCOperands returned a scalar, use it.
24348     if (!Op1.getNode()) {
24349       assert(Op0.getValueType() == Op.getValueType() &&
24350              "Unexpected setcc expansion!");
24351       if (IsStrict)
24352         return DAG.getMergeValues({Op0, Chain}, dl);
24353       return Op0;
24354     }
24355   }
24356 
24357   if (Op0.getSimpleValueType().isInteger()) {
    // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant,
    // which reduces the number of EFLAGS bit reads (the GE conditions don't
    // read ZF); this may translate to fewer uops depending on the uarch
    // implementation. The equivalent for SLE/ULE -> SLT/ULT isn't likely to
    // happen as we already canonicalize to that CondCode.
    // NOTE: Only do this if incrementing the constant doesn't increase the
    // bit encoding size - so it must either already be an i8 or i32
    // immediate, or it must shrink down to that. We don't do this for any
    // i64s to avoid additional constant materializations.
24367     // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
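    // For example (with i32 operands): (setugt X, 9) becomes (setuge X, 10),
    // while (setgt X, 127) is left alone because 128 would no longer fit in
    // an i8 immediate encoding.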
24368     if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24369       const APInt &Op1Val = Op1C->getAPIntValue();
24370       if (!Op1Val.isZero()) {
24371         // Ensure the constant+1 doesn't overflow.
24372         if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24373             (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24374           APInt Op1ValPlusOne = Op1Val + 1;
24375           if (Op1ValPlusOne.isSignedIntN(32) &&
24376               (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24377             Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24378             CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24379                                             : ISD::CondCode::SETUGE;
24380           }
24381         }
24382       }
24383     }
24384 
24385     SDValue X86CC;
24386     SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24387     SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24388     return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24389   }
24390 
24391   // Handle floating point.
24392   X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24393   if (CondCode == X86::COND_INVALID)
24394     return SDValue();
24395 
24396   SDValue EFLAGS;
24397   if (IsStrict) {
24398     bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24399     EFLAGS =
24400         DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24401                     dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24402     Chain = EFLAGS.getValue(1);
24403   } else {
24404     EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24405   }
24406 
24407   SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24408   SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24409   return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24410 }
24411 
24412 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24413   SDValue LHS = Op.getOperand(0);
24414   SDValue RHS = Op.getOperand(1);
24415   SDValue Carry = Op.getOperand(2);
24416   SDValue Cond = Op.getOperand(3);
24417   SDLoc DL(Op);
24418 
24419   assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24420   X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24421 
24422   // Recreate the carry if needed.
24423   EVT CarryVT = Carry.getValueType();
24424   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24425                       Carry, DAG.getAllOnesConstant(DL, CarryVT));
24426 
24427   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24428   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24429   return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24430 }
24431 
// This function computes three things: the arithmetic result itself (Value),
// an EFLAGS result (Overflow), and a condition code (Cond). Value and
// Overflow are returned as a pair; Cond is reported through the reference
// parameter. The flag and the condition code define the case in which the
// arithmetic computation overflows.
24436 static std::pair<SDValue, SDValue>
24437 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24438   assert(Op.getResNo() == 0 && "Unexpected result number!");
24439   SDValue Value, Overflow;
24440   SDValue LHS = Op.getOperand(0);
24441   SDValue RHS = Op.getOperand(1);
24442   unsigned BaseOp = 0;
24443   SDLoc DL(Op);
24444   switch (Op.getOpcode()) {
24445   default: llvm_unreachable("Unknown ovf instruction!");
24446   case ISD::SADDO:
24447     BaseOp = X86ISD::ADD;
24448     Cond = X86::COND_O;
24449     break;
24450   case ISD::UADDO:
24451     BaseOp = X86ISD::ADD;
24452     Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24453     break;
24454   case ISD::SSUBO:
24455     BaseOp = X86ISD::SUB;
24456     Cond = X86::COND_O;
24457     break;
24458   case ISD::USUBO:
24459     BaseOp = X86ISD::SUB;
24460     Cond = X86::COND_B;
24461     break;
24462   case ISD::SMULO:
24463     BaseOp = X86ISD::SMUL;
24464     Cond = X86::COND_O;
24465     break;
24466   case ISD::UMULO:
24467     BaseOp = X86ISD::UMUL;
24468     Cond = X86::COND_O;
24469     break;
24470   }
24471 
24472   if (BaseOp) {
24473     // Also sets EFLAGS.
24474     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24475     Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24476     Overflow = Value.getValue(1);
24477   }
24478 
24479   return std::make_pair(Value, Overflow);
24480 }
24481 
24482 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Lower the "add/sub/mul with overflow" instruction into a regular
  // instruction plus a "setcc" instruction that checks the overflow flag. The
  // "brcond" lowering looks for this combo and may remove the "setcc"
  // instruction if the "setcc" has only one use.
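  // For example, (uaddo X, Y) typically becomes an X86ISD::ADD of X and Y
  // plus a SETCC of COND_B (carry set) on the resulting EFLAGS; see
  // getX86XALUOOp above for the exact opcode/condition mapping.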
24487   SDLoc DL(Op);
24488   X86::CondCode Cond;
24489   SDValue Value, Overflow;
24490   std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24491 
24492   SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24493   assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24494   return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24495 }
24496 
/// Return true if the opcode is an X86 logical comparison.
24498 static bool isX86LogicalCmp(SDValue Op) {
24499   unsigned Opc = Op.getOpcode();
24500   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24501       Opc == X86ISD::FCMP)
24502     return true;
24503   if (Op.getResNo() == 1 &&
24504       (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24505        Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24506        Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24507     return true;
24508 
24509   return false;
24510 }
24511 
24512 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24513   if (V.getOpcode() != ISD::TRUNCATE)
24514     return false;
24515 
24516   SDValue VOp0 = V.getOperand(0);
24517   unsigned InBits = VOp0.getValueSizeInBits();
24518   unsigned Bits = V.getValueSizeInBits();
24519   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24520 }
24521 
24522 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
24523   bool AddTest = true;
24524   SDValue Cond  = Op.getOperand(0);
24525   SDValue Op1 = Op.getOperand(1);
24526   SDValue Op2 = Op.getOperand(2);
24527   SDLoc DL(Op);
24528   MVT VT = Op1.getSimpleValueType();
24529   SDValue CC;
24530 
24531   // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24532   // are available or VBLENDV if AVX is available.
24533   // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
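  // For example, with SSE a scalar f32 select on a one-use SETCC may lower
  // to a CMPSS mask followed by ANDPS/ANDNPS/ORPS, while AVX targets can
  // instead use a variable blend (see the VBLENDV comments below).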
24534   if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
24535       VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24536     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
24537     bool IsAlwaysSignaling;
24538     unsigned SSECC =
24539         translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24540                            CondOp0, CondOp1, IsAlwaysSignaling);
24541 
24542     if (Subtarget.hasAVX512()) {
24543       SDValue Cmp =
24544           DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
24545                       DAG.getTargetConstant(SSECC, DL, MVT::i8));
24546       assert(!VT.isVector() && "Not a scalar type?");
24547       return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24548     }
24549 
24550     if (SSECC < 8 || Subtarget.hasAVX()) {
24551       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
24552                                 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24553 
24554       // If we have AVX, we can use a variable vector select (VBLENDV) instead
24555       // of 3 logic instructions for size savings and potentially speed.
24556       // Unfortunately, there is no scalar form of VBLENDV.
24557 
24558       // If either operand is a +0.0 constant, don't try this. We can expect to
24559       // optimize away at least one of the logic instructions later in that
24560       // case, so that sequence would be faster than a variable blend.
24561 
24562       // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
24563       // uses XMM0 as the selection register. That may need just as many
24564       // instructions as the AND/ANDN/OR sequence due to register moves, so
24565       // don't bother.
24566       if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
24567           !isNullFPConstant(Op2)) {
24568         // Convert to vectors, do a VSELECT, and convert back to scalar.
24569         // All of the conversions should be optimized away.
24570         MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
24571         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
24572         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
24573         SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
24574 
24575         MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
24576         VCmp = DAG.getBitcast(VCmpVT, VCmp);
24577 
24578         SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
24579 
24580         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
24581                            VSel, DAG.getIntPtrConstant(0, DL));
24582       }
24583       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
24584       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
24585       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
24586     }
24587   }
24588 
24589   // AVX512 fallback is to lower selects of scalar floats to masked moves.
24590   if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
24591     SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
24592     return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24593   }
24594 
24595   if (Cond.getOpcode() == ISD::SETCC) {
24596     if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
24597       Cond = NewCond;
24598       // If the condition was updated, it's possible that the operands of the
24599       // select were also updated (for example, EmitTest has a RAUW). Refresh
24600       // the local references to the select operands in case they got stale.
24601       Op1 = Op.getOperand(1);
24602       Op2 = Op.getOperand(2);
24603     }
24604   }
24605 
24606   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24607   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24608   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24609   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24610   // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24611   // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24612   if (Cond.getOpcode() == X86ISD::SETCC &&
24613       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
24614       isNullConstant(Cond.getOperand(1).getOperand(1))) {
24615     SDValue Cmp = Cond.getOperand(1);
24616     SDValue CmpOp0 = Cmp.getOperand(0);
24617     unsigned CondCode = Cond.getConstantOperandVal(0);
24618 
    // Special handling for the __builtin_ffs(X) - 1 pattern, which looks like
    // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
    // handling here to keep the CMP with 0. The CMP should be removed later
    // by optimizeCompareInst, using the flags from the BSR/TZCNT emitted for
    // the cttz_zero_undef.
24624     auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
24625       return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
24626               Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
24627     };
24628     if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
24629         ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
24630          (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
24631       // Keep Cmp.
24632     } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24633         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
24634       SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
24635       SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
24636 
24637       // 'X - 1' sets the carry flag if X == 0.
24638       // '0 - X' sets the carry flag if X != 0.
24639       // Convert the carry flag to a -1/0 mask with sbb:
24640       // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24641       // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24642       // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24643       // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24644       SDValue Sub;
24645       if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
24646         SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
24647         Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
24648       } else {
24649         SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
24650         Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
24651       }
24652       SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24653                                 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24654                                 Sub.getValue(1));
24655       return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24656     } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
24657                Cmp.getOperand(0).getOpcode() == ISD::AND &&
24658                isOneConstant(Cmp.getOperand(0).getOperand(1))) {
24659       SDValue Src1, Src2;
      // Returns true if Op2 is an XOR or OR operator and one of its operands
      // equals Op1, i.e. the pattern is (a, a op b) or (b, a op b).
24663       auto isOrXorPattern = [&]() {
24664         if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
24665             (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
24666           Src1 =
24667               Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
24668           Src2 = Op1;
24669           return true;
24670         }
24671         return false;
24672       };
24673 
24674       if (isOrXorPattern()) {
24675         SDValue Neg;
24676         unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
        // We need a mask of all zeros or all ones with the same size as the
        // other operands.
24679         if (CmpSz > VT.getSizeInBits())
24680           Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
24681         else if (CmpSz < VT.getSizeInBits())
24682           Neg = DAG.getNode(ISD::AND, DL, VT,
24683               DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
24684               DAG.getConstant(1, DL, VT));
24685         else
24686           Neg = CmpOp0;
24687         SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
24688                                    Neg); // -(and (x, 0x1))
24689         SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
24690         return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
24691       }
24692     }
24693   }
24694 
24695   // Look past (and (setcc_carry (cmp ...)), 1).
24696   if (Cond.getOpcode() == ISD::AND &&
24697       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
24698       isOneConstant(Cond.getOperand(1)))
24699     Cond = Cond.getOperand(0);
24700 
24701   // If condition flag is set by a X86ISD::CMP, then use it as the condition
24702   // setting operand in place of the X86ISD::SETCC.
24703   unsigned CondOpcode = Cond.getOpcode();
24704   if (CondOpcode == X86ISD::SETCC ||
24705       CondOpcode == X86ISD::SETCC_CARRY) {
24706     CC = Cond.getOperand(0);
24707 
24708     SDValue Cmp = Cond.getOperand(1);
24709     bool IllegalFPCMov = false;
24710     if (VT.isFloatingPoint() && !VT.isVector() &&
24711         !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov())  // FPStack?
24712       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24713 
24714     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
24715         Cmp.getOpcode() == X86ISD::BT) { // FIXME
24716       Cond = Cmp;
24717       AddTest = false;
24718     }
24719   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
24720              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
24721              CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
24722     SDValue Value;
24723     X86::CondCode X86Cond;
24724     std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24725 
24726     CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
24727     AddTest = false;
24728   }
24729 
24730   if (AddTest) {
24731     // Look past the truncate if the high bits are known zero.
24732     if (isTruncWithZeroHighBitsInput(Cond, DAG))
24733       Cond = Cond.getOperand(0);
24734 
24735     // We know the result of AND is compared against zero. Try to match
24736     // it to BT.
24737     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24738       SDValue BTCC;
24739       if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
24740         CC = BTCC;
24741         Cond = BT;
24742         AddTest = false;
24743       }
24744     }
24745   }
24746 
24747   if (AddTest) {
24748     CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24749     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24750   }
24751 
24752   // a <  b ? -1 :  0 -> RES = ~setcc_carry
24753   // a <  b ?  0 : -1 -> RES = setcc_carry
24754   // a >= b ? -1 :  0 -> RES = setcc_carry
24755   // a >= b ?  0 : -1 -> RES = ~setcc_carry
24756   if (Cond.getOpcode() == X86ISD::SUB) {
24757     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
24758 
24759     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24760         (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24761         (isNullConstant(Op1) || isNullConstant(Op2))) {
24762       SDValue Res =
24763           DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24764                       DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24765       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24766         return DAG.getNOT(DL, Res, Res.getValueType());
24767       return Res;
24768     }
24769   }
24770 
  // X86 doesn't have an i8 cmov. If both operands are the result of a
  // truncate, widen the cmov and push the truncate through. This avoids
  // introducing a new branch during isel and doesn't add any extensions.
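  // i.e. (select C, (trunc X), (trunc Y)) -> (trunc (select C, X, Y)) when X
  // and Y have the same wider type and neither is a CopyFromReg.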
24774   if (Op.getValueType() == MVT::i8 &&
24775       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24776     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24777     if (T1.getValueType() == T2.getValueType() &&
24778         // Exclude CopyFromReg to avoid partial register stalls.
        T1.getOpcode() != ISD::CopyFromReg &&
        T2.getOpcode() != ISD::CopyFromReg) {
24780       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24781                                  CC, Cond);
24782       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24783     }
24784   }
24785 
24786   // Or finally, promote i8 cmovs if we have CMOV,
24787   //                 or i16 cmovs if it won't prevent folding a load.
  // FIXME: we should not limit promotion of the i8 case to only when the CMOV
  //        is legal, but EmitLoweredSelect() cannot deal with these extensions
  //        being inserted between two CMOVs (the same applies in the i16
  //        case).
24791   //        https://bugs.llvm.org/show_bug.cgi?id=40974
24792   if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
24793       (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
24794        !X86::mayFoldLoad(Op2, Subtarget))) {
24795     Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24796     Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24797     SDValue Ops[] = { Op2, Op1, CC, Cond };
24798     SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24799     return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24800   }
24801 
24802   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24803   // condition is true.
24804   SDValue Ops[] = { Op2, Op1, CC, Cond };
24805   return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
24806 }
24807 
24808 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
24809                                      const X86Subtarget &Subtarget,
24810                                      SelectionDAG &DAG) {
24811   MVT VT = Op->getSimpleValueType(0);
24812   SDValue In = Op->getOperand(0);
24813   MVT InVT = In.getSimpleValueType();
24814   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24815   MVT VTElt = VT.getVectorElementType();
24816   SDLoc dl(Op);
24817 
24818   unsigned NumElts = VT.getVectorNumElements();
24819 
24820   // Extend VT if the scalar type is i8/i16 and BWI is not supported.
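  // For example, a v8i1 -> v8i16 extend without BWI is instead performed as a
  // v8i1 -> v8i32 extend (possibly widened further below) and truncated back
  // to v8i16 afterwards.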
24821   MVT ExtVT = VT;
24822   if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24823     // If v16i32 is to be avoided, we'll need to split and concatenate.
24824     if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24825       return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24826 
24827     ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24828   }
24829 
24830   // Widen to 512-bits if VLX is not supported.
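  // For example, without VLX a v8i1 -> v8i32 extend is performed as a
  // v16i1 -> v16i32 extend and the low v8i32 subvector is extracted at the
  // end of this function.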
24831   MVT WideVT = ExtVT;
24832   if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24833     NumElts *= 512 / ExtVT.getSizeInBits();
24834     InVT = MVT::getVectorVT(MVT::i1, NumElts);
24835     In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24836                      In, DAG.getIntPtrConstant(0, dl));
24837     WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24838   }
24839 
24840   SDValue V;
24841   MVT WideEltVT = WideVT.getVectorElementType();
24842   if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24843       (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24844     V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24845   } else {
24846     SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24847     SDValue Zero = DAG.getConstant(0, dl, WideVT);
24848     V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24849   }
24850 
24851   // Truncate if we had to extend i16/i8 above.
24852   if (VT != ExtVT) {
24853     WideVT = MVT::getVectorVT(VTElt, NumElts);
24854     V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24855   }
24856 
24857   // Extract back to 128/256-bit if we widened.
24858   if (WideVT != VT)
24859     V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24860                     DAG.getIntPtrConstant(0, dl));
24861 
24862   return V;
24863 }
24864 
24865 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24866                                SelectionDAG &DAG) {
24867   SDValue In = Op->getOperand(0);
24868   MVT InVT = In.getSimpleValueType();
24869 
24870   if (InVT.getVectorElementType() == MVT::i1)
24871     return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24872 
24873   assert(Subtarget.hasAVX() && "Expected AVX support");
24874   return LowerAVXExtend(Op, DAG, Subtarget);
24875 }
24876 
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
// For sign extends this needs to handle all vector sizes and both SSE4.1 and
// non-SSE4.1 targets. For zero extends this should only handle inputs of
// MVT::v64i8 when BWI is not supported, but AVX512 is.
24881 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
24882                                         const X86Subtarget &Subtarget,
24883                                         SelectionDAG &DAG) {
24884   SDValue In = Op->getOperand(0);
24885   MVT VT = Op->getSimpleValueType(0);
24886   MVT InVT = In.getSimpleValueType();
24887 
24888   MVT SVT = VT.getVectorElementType();
24889   MVT InSVT = InVT.getVectorElementType();
24890   assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
24891 
24892   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24893     return SDValue();
24894   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24895     return SDValue();
24896   if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24897       !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24898       !(VT.is512BitVector() && Subtarget.hasAVX512()))
24899     return SDValue();
24900 
24901   SDLoc dl(Op);
24902   unsigned Opc = Op.getOpcode();
24903   unsigned NumElts = VT.getVectorNumElements();
24904 
24905   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24906   // For 512-bit vectors, we need 128-bits or 256-bits.
24907   if (InVT.getSizeInBits() > 128) {
24908     // Input needs to be at least the same number of elements as output, and
24909     // at least 128-bits.
24910     int InSize = InSVT.getSizeInBits() * NumElts;
24911     In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24912     InVT = In.getSimpleValueType();
24913   }
24914 
  // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
  // results, so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
  // instructions still need to be handled here for 256/512-bit results.
24918   if (Subtarget.hasInt256()) {
24919     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24920 
24921     if (InVT.getVectorNumElements() != NumElts)
24922       return DAG.getNode(Op.getOpcode(), dl, VT, In);
24923 
24924     // FIXME: Apparently we create inreg operations that could be regular
24925     // extends.
24926     unsigned ExtOpc =
24927         Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24928                                              : ISD::ZERO_EXTEND;
24929     return DAG.getNode(ExtOpc, dl, VT, In);
24930   }
24931 
24932   // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
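  // For example, a v8i32 sign_extend_vector_inreg of a v16i8 source is
  // split into two v4i32 extensions (the low half directly, the high half
  // via a shuffle) and the results are concatenated back together.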
24933   if (Subtarget.hasAVX()) {
24934     assert(VT.is256BitVector() && "256-bit vector expected");
24935     MVT HalfVT = VT.getHalfNumVectorElementsVT();
24936     int HalfNumElts = HalfVT.getVectorNumElements();
24937 
24938     unsigned NumSrcElts = InVT.getVectorNumElements();
24939     SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24940     for (int i = 0; i != HalfNumElts; ++i)
24941       HiMask[i] = HalfNumElts + i;
24942 
24943     SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24944     SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24945     Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24946     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24947   }
24948 
24949   // We should only get here for sign extend.
24950   assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24951   assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24952 
24953   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24954   SDValue Curr = In;
24955   SDValue SignExt = Curr;
24956 
24957   // As SRAI is only available on i16/i32 types, we expand only up to i32
24958   // and handle i64 separately.
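  // For example, a v16i8 -> v4i32 sign extension shuffles each of the low
  // four source bytes into the most significant byte of an i32 lane and
  // then arithmetic-shifts each lane right by 24 bits.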
24959   if (InVT != MVT::v4i32) {
24960     MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24961 
24962     unsigned DestWidth = DestVT.getScalarSizeInBits();
24963     unsigned Scale = DestWidth / InSVT.getSizeInBits();
24964 
24965     unsigned InNumElts = InVT.getVectorNumElements();
24966     unsigned DestElts = DestVT.getVectorNumElements();
24967 
24968     // Build a shuffle mask that takes each input element and places it in the
24969     // MSBs of the new element size.
24970     SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24971     for (unsigned i = 0; i != DestElts; ++i)
24972       Mask[i * Scale + (Scale - 1)] = i;
24973 
24974     Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24975     Curr = DAG.getBitcast(DestVT, Curr);
24976 
24977     unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24978     SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24979                           DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24980   }
24981 
24982   if (VT == MVT::v2i64) {
24983     assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
24984     SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24985     SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24986     SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24987     SignExt = DAG.getBitcast(VT, SignExt);
24988   }
24989 
24990   return SignExt;
24991 }
24992 
24993 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24994                                 SelectionDAG &DAG) {
24995   MVT VT = Op->getSimpleValueType(0);
24996   SDValue In = Op->getOperand(0);
24997   MVT InVT = In.getSimpleValueType();
24998   SDLoc dl(Op);
24999 
25000   if (InVT.getVectorElementType() == MVT::i1)
25001     return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
25002 
25003   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25004   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
25005          "Expected same number of elements");
25006   assert((VT.getVectorElementType() == MVT::i16 ||
25007           VT.getVectorElementType() == MVT::i32 ||
25008           VT.getVectorElementType() == MVT::i64) &&
25009          "Unexpected element type");
25010   assert((InVT.getVectorElementType() == MVT::i8 ||
25011           InVT.getVectorElementType() == MVT::i16 ||
25012           InVT.getVectorElementType() == MVT::i32) &&
25013          "Unexpected element type");
25014 
25015   if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25016     assert(InVT == MVT::v32i8 && "Unexpected VT!");
25017     return splitVectorIntUnary(Op, DAG);
25018   }
25019 
25020   if (Subtarget.hasInt256())
25021     return Op;
25022 
  // Optimize vectors in AVX mode:
  // sign extend v8i16 to v8i32 and v4i32 to v4i64.
  //
  // Divide the input vector into two parts; for v4i32 the high shuffle mask
  // will be {2, 3, -1, -1}. Use the vpmovsx instruction to extend
  // v4i32 -> v2i64 / v8i16 -> v4i32, then concatenate the vectors back to the
  // original VT.
25031   MVT HalfVT = VT.getHalfNumVectorElementsVT();
25032   SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25033 
25034   unsigned NumElems = InVT.getVectorNumElements();
25035   SmallVector<int,8> ShufMask(NumElems, -1);
25036   for (unsigned i = 0; i != NumElems/2; ++i)
25037     ShufMask[i] = i + NumElems/2;
25038 
25039   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25040   OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25041 
25042   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25043 }
25044 
25045 /// Change a vector store into a pair of half-size vector stores.
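/// For example, a 256-bit store becomes two 128-bit stores, the second at a
/// 16-byte offset from the original base pointer.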
25046 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25047   SDValue StoredVal = Store->getValue();
25048   assert((StoredVal.getValueType().is256BitVector() ||
25049           StoredVal.getValueType().is512BitVector()) &&
25050          "Expecting 256/512-bit op");
25051 
25052   // Splitting volatile memory ops is not allowed unless the operation was not
25053   // legal to begin with. Assume the input store is legal (this transform is
25054   // only used for targets with AVX). Note: It is possible that we have an
25055   // illegal type like v2i128, and so we could allow splitting a volatile store
25056   // in that case if that is important.
25057   if (!Store->isSimple())
25058     return SDValue();
25059 
25060   SDLoc DL(Store);
25061   SDValue Value0, Value1;
25062   std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25063   unsigned HalfOffset = Value0.getValueType().getStoreSize();
25064   SDValue Ptr0 = Store->getBasePtr();
25065   SDValue Ptr1 =
25066       DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
25067   SDValue Ch0 =
25068       DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25069                    Store->getOriginalAlign(),
25070                    Store->getMemOperand()->getFlags());
25071   SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25072                              Store->getPointerInfo().getWithOffset(HalfOffset),
25073                              Store->getOriginalAlign(),
25074                              Store->getMemOperand()->getFlags());
25075   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25076 }
25077 
/// Scalarize a vector store, bitcasting to StoreVT to determine the scalar
/// type.
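/// For example, a v2i64 StoreVT is emitted as two i64 element stores at
/// offsets 0 and 8 from the base pointer.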
25080 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25081                                     SelectionDAG &DAG) {
25082   SDValue StoredVal = Store->getValue();
25083   assert(StoreVT.is128BitVector() &&
25084          StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25085   StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25086 
25087   // Splitting volatile memory ops is not allowed unless the operation was not
25088   // legal to begin with. We are assuming the input op is legal (this transform
25089   // is only used for targets with AVX).
25090   if (!Store->isSimple())
25091     return SDValue();
25092 
25093   MVT StoreSVT = StoreVT.getScalarType();
25094   unsigned NumElems = StoreVT.getVectorNumElements();
25095   unsigned ScalarSize = StoreSVT.getStoreSize();
25096 
25097   SDLoc DL(Store);
25098   SmallVector<SDValue, 4> Stores;
25099   for (unsigned i = 0; i != NumElems; ++i) {
25100     unsigned Offset = i * ScalarSize;
25101     SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25102                                            TypeSize::Fixed(Offset), DL);
25103     SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25104                               DAG.getIntPtrConstant(i, DL));
25105     SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25106                               Store->getPointerInfo().getWithOffset(Offset),
25107                               Store->getOriginalAlign(),
25108                               Store->getMemOperand()->getFlags());
25109     Stores.push_back(Ch);
25110   }
25111   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25112 }
25113 
25114 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25115                           SelectionDAG &DAG) {
25116   StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25117   SDLoc dl(St);
25118   SDValue StoredVal = St->getValue();
25119 
25120   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25121   if (StoredVal.getValueType().isVector() &&
25122       StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25123     unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25124     assert(NumElts <= 8 && "Unexpected VT");
25125     assert(!St->isTruncatingStore() && "Expected non-truncating store");
25126     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25127            "Expected AVX512F without AVX512DQI");
25128 
25129     // We must pad with zeros to ensure we store zeroes to any unused bits.
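    // For example, a v4i1 value is inserted into a v16i1 vector, bitcast to
    // i16, truncated to i8, and then the bits above bit 3 are cleared so the
    // whole stored byte is well defined.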
25130     StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25131                             DAG.getUNDEF(MVT::v16i1), StoredVal,
25132                             DAG.getIntPtrConstant(0, dl));
25133     StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25134     StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25135     // Make sure we store zeros in the extra bits.
25136     if (NumElts < 8)
25137       StoredVal = DAG.getZeroExtendInReg(
25138           StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25139 
25140     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25141                         St->getPointerInfo(), St->getOriginalAlign(),
25142                         St->getMemOperand()->getFlags());
25143   }
25144 
25145   if (St->isTruncatingStore())
25146     return SDValue();
25147 
25148   // If this is a 256-bit store of concatenated ops, we are better off splitting
25149   // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
25150   // and each half can execute independently. Some cores would split the op into
25151   // halves anyway, so the concat (vinsertf128) is purely an extra op.
25152   MVT StoreVT = StoredVal.getSimpleValueType();
25153   if (StoreVT.is256BitVector() ||
25154       ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
25155        !Subtarget.hasBWI())) {
25156     SmallVector<SDValue, 4> CatOps;
25157     if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
25158       return splitVectorStore(St, DAG);
25159     return SDValue();
25160   }
25161 
25162   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25163   assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
25164          "Unexpected VT");
25165   assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25166              TargetLowering::TypeWidenVector && "Unexpected type action!");
25167 
25168   EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25169   StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25170                           DAG.getUNDEF(StoreVT));
25171 
25172   if (Subtarget.hasSSE2()) {
25173     // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25174     // and store it.
25175     MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25176     MVT CastVT = MVT::getVectorVT(StVT, 2);
25177     StoredVal = DAG.getBitcast(CastVT, StoredVal);
25178     StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25179                             DAG.getIntPtrConstant(0, dl));
25180 
25181     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25182                         St->getPointerInfo(), St->getOriginalAlign(),
25183                         St->getMemOperand()->getFlags());
25184   }
25185   assert(Subtarget.hasSSE1() && "Expected SSE");
25186   SDVTList Tys = DAG.getVTList(MVT::Other);
25187   SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25188   return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25189                                  St->getMemOperand());
25190 }
25191 
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
25196 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25197 // TODO: It is possible to support ZExt by zeroing the undef values during
25198 // the shuffle phase or after the shuffle.
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
25201   MVT RegVT = Op.getSimpleValueType();
25202   assert(RegVT.isVector() && "We only custom lower vector loads.");
25203   assert(RegVT.isInteger() &&
25204          "We only custom lower integer vector loads.");
25205 
25206   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25207   SDLoc dl(Ld);
25208 
25209   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25210   if (RegVT.getVectorElementType() == MVT::i1) {
25211     assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25212     assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25213     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25214            "Expected AVX512F without AVX512DQI");
25215 
25216     SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25217                                 Ld->getPointerInfo(), Ld->getOriginalAlign(),
25218                                 Ld->getMemOperand()->getFlags());
25219 
25220     // Replace chain users with the new chain.
25221     assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25222 
25223     SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25224     Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25225                       DAG.getBitcast(MVT::v16i1, Val),
25226                       DAG.getIntPtrConstant(0, dl));
25227     return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25228   }
25229 
25230   return SDValue();
25231 }
25232 
25233 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25234 /// each of which has no other use apart from the AND / OR.
25235 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25236   Opc = Op.getOpcode();
25237   if (Opc != ISD::OR && Opc != ISD::AND)
25238     return false;
25239   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25240           Op.getOperand(0).hasOneUse() &&
25241           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25242           Op.getOperand(1).hasOneUse());
25243 }
25244 
25245 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25246   SDValue Chain = Op.getOperand(0);
25247   SDValue Cond  = Op.getOperand(1);
25248   SDValue Dest  = Op.getOperand(2);
25249   SDLoc dl(Op);
25250 
25251   if (Cond.getOpcode() == ISD::SETCC &&
25252       Cond.getOperand(0).getValueType() != MVT::f128) {
25253     SDValue LHS = Cond.getOperand(0);
25254     SDValue RHS = Cond.getOperand(1);
25255     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25256 
25257     // Special case for
25258     // setcc([su]{add,sub,mul}o == 0)
25259     // setcc([su]{add,sub,mul}o != 1)
25260     if (ISD::isOverflowIntrOpRes(LHS) &&
25261         (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25262         (isNullConstant(RHS) || isOneConstant(RHS))) {
25263       SDValue Value, Overflow;
25264       X86::CondCode X86Cond;
25265       std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25266 
25267       if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25268         X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25269 
25270       SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25271       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25272                          Overflow);
25273     }
25274 
25275     if (LHS.getSimpleValueType().isInteger()) {
25276       SDValue CCVal;
25277       SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25278       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25279                          EFLAGS);
25280     }
25281 
25282     if (CC == ISD::SETOEQ) {
25283       // For FCMP_OEQ, we can emit
25284       // two branches instead of an explicit AND instruction with a
25285       // separate test. However, we only do this if this block doesn't
25286       // have a fall-through edge, because this requires an explicit
25287       // jmp when the condition is false.
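      // For example, (brcond (setoeq X, Y), TrueBB) followed by an
      // unconditional branch to FalseBB becomes, roughly:
      //   jne FalseBB; jp FalseBB; jmp TrueBB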
25288       if (Op.getNode()->hasOneUse()) {
25289         SDNode *User = *Op.getNode()->use_begin();
25290         // Look for an unconditional branch following this conditional branch.
25291         // We need this because we need to reverse the successors in order
25292         // to implement FCMP_OEQ.
25293         if (User->getOpcode() == ISD::BR) {
25294           SDValue FalseBB = User->getOperand(1);
25295           SDNode *NewBR =
25296             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25297           assert(NewBR == User);
25298           (void)NewBR;
25299           Dest = FalseBB;
25300 
25301           SDValue Cmp =
25302               DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25303           SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25304           Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25305                               CCVal, Cmp);
25306           CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25307           return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25308                              Cmp);
25309         }
25310       }
25311     } else if (CC == ISD::SETUNE) {
25312       // For FCMP_UNE, we can emit
25313       // two branches instead of an explicit OR instruction with a
25314       // separate test.
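      // For example, (brcond (setune X, Y), Dest) becomes, roughly:
      //   jne Dest; jp Dest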
25315       SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25316       SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25317       Chain =
25318           DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
25319       CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25320       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25321                          Cmp);
25322     } else {
25323       X86::CondCode X86Cond =
25324           TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25325       SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25326       SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25327       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25328                          Cmp);
25329     }
25330   }
25331 
25332   if (ISD::isOverflowIntrOpRes(Cond)) {
25333     SDValue Value, Overflow;
25334     X86::CondCode X86Cond;
25335     std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25336 
25337     SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25338     return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25339                        Overflow);
25340   }
25341 
25342   // Look past the truncate if the high bits are known zero.
25343   if (isTruncWithZeroHighBitsInput(Cond, DAG))
25344     Cond = Cond.getOperand(0);
25345 
25346   EVT CondVT = Cond.getValueType();
25347 
25348   // Add an AND with 1 if we don't already have one.
25349   if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25350     Cond =
25351         DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25352 
25353   SDValue LHS = Cond;
25354   SDValue RHS = DAG.getConstant(0, dl, CondVT);
25355 
25356   SDValue CCVal;
25357   SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25358   return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25359                      EFLAGS);
25360 }
25361 
25362 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25363 // Calls to _alloca are needed to probe the stack when allocating more than 4k
25364 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
25365 // that the guard pages used by the OS virtual memory manager are allocated in
25366 // correct sequence.
25367 SDValue
25368 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25369                                            SelectionDAG &DAG) const {
25370   MachineFunction &MF = DAG.getMachineFunction();
25371   bool SplitStack = MF.shouldSplitStack();
25372   bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25373   bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25374                SplitStack || EmitStackProbeCall;
25375   SDLoc dl(Op);
25376 
25377   // Get the inputs.
25378   SDNode *Node = Op.getNode();
25379   SDValue Chain = Op.getOperand(0);
25380   SDValue Size  = Op.getOperand(1);
25381   MaybeAlign Alignment(Op.getConstantOperandVal(2));
25382   EVT VT = Node->getValueType(0);
25383 
25384   // Chain the dynamic stack allocation so that it doesn't modify the stack
25385   // pointer when other instructions are using the stack.
25386   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25387 
25388   bool Is64Bit = Subtarget.is64Bit();
25389   MVT SPTy = getPointerTy(DAG.getDataLayout());
25390 
25391   SDValue Result;
25392   if (!Lower) {
25393     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25394     Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25395     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25396                     " not tell us which reg is the stack pointer!");
25397 
25398     const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25399     const Align StackAlign = TFI.getStackAlign();
25400     if (hasInlineStackProbe(MF)) {
25401       MachineRegisterInfo &MRI = MF.getRegInfo();
25402 
25403       const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25404       Register Vreg = MRI.createVirtualRegister(AddrRegClass);
25405       Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
25406       Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
25407                            DAG.getRegister(Vreg, SPTy));
25408     } else {
25409       SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25410       Chain = SP.getValue(1);
25411       Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25412     }
25413     if (Alignment && *Alignment > StackAlign)
25414       Result =
25415           DAG.getNode(ISD::AND, dl, VT, Result,
25416                       DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
25417     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25418   } else if (SplitStack) {
25419     MachineRegisterInfo &MRI = MF.getRegInfo();
25420 
25421     if (Is64Bit) {
      // The 64 bit implementation of segmented stacks needs to clobber both
      // r10 and r11. This makes it impossible to use it along with nested
      // parameters.
25424       const Function &F = MF.getFunction();
25425       for (const auto &A : F.args()) {
25426         if (A.hasNestAttr())
25427           report_fatal_error("Cannot use segmented stacks with functions that "
25428                              "have nested arguments.");
25429       }
25430     }
25431 
25432     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25433     Register Vreg = MRI.createVirtualRegister(AddrRegClass);
25434     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
25435     Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
25436                                 DAG.getRegister(Vreg, SPTy));
25437   } else {
25438     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25439     Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25440     MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25441 
25442     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25443     Register SPReg = RegInfo->getStackRegister();
25444     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25445     Chain = SP.getValue(1);
25446 
25447     if (Alignment) {
25448       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
25449                        DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
25450       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25451     }
25452 
25453     Result = SP;
25454   }
25455 
25456   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
25457                              DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
25458 
25459   SDValue Ops[2] = {Result, Chain};
25460   return DAG.getMergeValues(Ops, dl);
25461 }
25462 
25463 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25464   MachineFunction &MF = DAG.getMachineFunction();
25465   auto PtrVT = getPointerTy(MF.getDataLayout());
25466   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25467 
25468   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25469   SDLoc DL(Op);
25470 
25471   if (!Subtarget.is64Bit() ||
25472       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25473     // vastart just stores the address of the VarArgsFrameIndex slot into the
25474     // memory location argument.
25475     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25476     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
25477                         MachinePointerInfo(SV));
25478   }
25479 
  // __va_list_tag:
  //   gp_offset         (0 - 6 * 8)
  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (points to parameters passed in memory).
  //   reg_save_area
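  // In C terms this roughly corresponds to:
  //   struct __va_list_tag {
  //     unsigned int gp_offset;
  //     unsigned int fp_offset;
  //     void *overflow_arg_area;
  //     void *reg_save_area;
  //   };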
25485   SmallVector<SDValue, 8> MemOps;
25486   SDValue FIN = Op.getOperand(1);
25487   // Store gp_offset
25488   SDValue Store = DAG.getStore(
25489       Op.getOperand(0), DL,
25490       DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25491       MachinePointerInfo(SV));
25492   MemOps.push_back(Store);
25493 
25494   // Store fp_offset
25495   FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
25496   Store = DAG.getStore(
25497       Op.getOperand(0), DL,
25498       DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25499       MachinePointerInfo(SV, 4));
25500   MemOps.push_back(Store);
25501 
25502   // Store ptr to overflow_arg_area
25503   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25504   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25505   Store =
25506       DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25507   MemOps.push_back(Store);
25508 
25509   // Store ptr to reg_save_area.
25510   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25511       Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25512   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25513   Store = DAG.getStore(
25514       Op.getOperand(0), DL, RSFIN, FIN,
25515       MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25516   MemOps.push_back(Store);
25517   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25518 }
25519 
25520 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25521   assert(Subtarget.is64Bit() &&
25522          "LowerVAARG only handles 64-bit va_arg!");
25523   assert(Op.getNumOperands() == 4);
25524 
25525   MachineFunction &MF = DAG.getMachineFunction();
25526   if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25527     // The Win64 ABI uses char* instead of a structure.
25528     return DAG.expandVAArg(Op.getNode());
25529 
25530   SDValue Chain = Op.getOperand(0);
25531   SDValue SrcPtr = Op.getOperand(1);
25532   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25533   unsigned Align = Op.getConstantOperandVal(3);
25534   SDLoc dl(Op);
25535 
25536   EVT ArgVT = Op.getNode()->getValueType(0);
25537   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25538   uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25539   uint8_t ArgMode;
25540 
25541   // Decide which area this value should be read from.
25542   // TODO: Implement the AMD64 ABI in its entirety. This simple
25543   // selection mechanism works only for the basic types.
25544   assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25545   if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25546     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
25547   } else {
25548     assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25549            "Unhandled argument type in LowerVAARG");
25550     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
25551   }
25552 
25553   if (ArgMode == 2) {
25554     // Make sure using fp_offset makes sense.
25555     assert(!Subtarget.useSoftFloat() &&
25556            !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25557            Subtarget.hasSSE1());
25558   }
25559 
  // Insert VAARG node into the DAG.
  // VAARG returns two values: the variable argument address and the chain.
25562   SDValue InstOps[] = {Chain, SrcPtr,
25563                        DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25564                        DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25565                        DAG.getTargetConstant(Align, dl, MVT::i32)};
25566   SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
25567   SDValue VAARG = DAG.getMemIntrinsicNode(
25568       Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
25569       VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
25570       /*Alignment=*/None,
25571       MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
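  // VAARG_64/VAARG_X32 is a pseudo that is expanded later via a custom
  // inserter into the gp_offset/fp_offset checks and updates; here it simply
  // yields the address of the next argument together with the updated chain.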
25572   Chain = VAARG.getValue(1);
25573 
25574   // Load the next argument and return it
25575   return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
25576 }
25577 
25578 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
25579                            SelectionDAG &DAG) {
25580   // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25581   // where a va_list is still an i8*.
25582   assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25583   if (Subtarget.isCallingConvWin64(
25584         DAG.getMachineFunction().getFunction().getCallingConv()))
    // A Win64 va_list is just an i8*, so expand to a simple pointer copy.
25586     return DAG.expandVACopy(Op.getNode());
25587 
25588   SDValue Chain = Op.getOperand(0);
25589   SDValue DstPtr = Op.getOperand(1);
25590   SDValue SrcPtr = Op.getOperand(2);
25591   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25592   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25593   SDLoc DL(Op);
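  // Copy the whole va_list: 24 bytes on LP64 (i32, i32 and two 8-byte
  // pointers) and 16 bytes on X32 (4-byte pointers).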
25594 
25595   return DAG.getMemcpy(
25596       Chain, DL, DstPtr, SrcPtr,
25597       DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
25598       Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
25599       false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
25600 }
25601 
25602 // Helper to get immediate/variable SSE shift opcode from other shift opcodes.
25603 static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
25604   switch (Opc) {
25605   case ISD::SHL:
25606   case X86ISD::VSHL:
25607   case X86ISD::VSHLI:
25608     return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
25609   case ISD::SRL:
25610   case X86ISD::VSRL:
25611   case X86ISD::VSRLI:
25612     return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
25613   case ISD::SRA:
25614   case X86ISD::VSRA:
25615   case X86ISD::VSRAI:
25616     return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
25617   }
25618   llvm_unreachable("Unknown target vector shift node");
25619 }
25620 
25621 /// Handle vector element shifts where the shift amount is a constant.
25622 /// Takes immediate version of shift as input.
25623 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
25624                                           SDValue SrcOp, uint64_t ShiftAmt,
25625                                           SelectionDAG &DAG) {
25626   MVT ElementType = VT.getVectorElementType();
25627 
  // Bitcast the source vector to the output type; this is mainly necessary
  // for vXi8/vXi64 shifts.
25630   if (VT != SrcOp.getSimpleValueType())
25631     SrcOp = DAG.getBitcast(VT, SrcOp);
25632 
25633   // Fold this packed shift into its first operand if ShiftAmt is 0.
25634   if (ShiftAmt == 0)
25635     return SrcOp;
25636 
25637   // Check for ShiftAmt >= element width
25638   if (ShiftAmt >= ElementType.getSizeInBits()) {
25639     if (Opc == X86ISD::VSRAI)
25640       ShiftAmt = ElementType.getSizeInBits() - 1;
25641     else
25642       return DAG.getConstant(0, dl, VT);
25643   }
25644 
25645   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
25646          && "Unknown target vector shift-by-constant node");
25647 
25648   // Fold this packed vector shift into a build vector if SrcOp is a
25649   // vector of Constants or UNDEFs.
25650   if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
25651     SmallVector<SDValue, 8> Elts;
25652     unsigned NumElts = SrcOp->getNumOperands();
25653 
25654     switch (Opc) {
25655     default: llvm_unreachable("Unknown opcode!");
25656     case X86ISD::VSHLI:
25657       for (unsigned i = 0; i != NumElts; ++i) {
25658         SDValue CurrentOp = SrcOp->getOperand(i);
25659         if (CurrentOp->isUndef()) {
25660           // Must produce 0s in the correct bits.
25661           Elts.push_back(DAG.getConstant(0, dl, ElementType));
25662           continue;
25663         }
25664         auto *ND = cast<ConstantSDNode>(CurrentOp);
25665         const APInt &C = ND->getAPIntValue();
25666         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
25667       }
25668       break;
25669     case X86ISD::VSRLI:
25670       for (unsigned i = 0; i != NumElts; ++i) {
25671         SDValue CurrentOp = SrcOp->getOperand(i);
25672         if (CurrentOp->isUndef()) {
25673           // Must produce 0s in the correct bits.
25674           Elts.push_back(DAG.getConstant(0, dl, ElementType));
25675           continue;
25676         }
25677         auto *ND = cast<ConstantSDNode>(CurrentOp);
25678         const APInt &C = ND->getAPIntValue();
25679         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
25680       }
25681       break;
25682     case X86ISD::VSRAI:
25683       for (unsigned i = 0; i != NumElts; ++i) {
25684         SDValue CurrentOp = SrcOp->getOperand(i);
25685         if (CurrentOp->isUndef()) {
25686           // All shifted in bits must be the same so use 0.
25687           Elts.push_back(DAG.getConstant(0, dl, ElementType));
25688           continue;
25689         }
25690         auto *ND = cast<ConstantSDNode>(CurrentOp);
25691         const APInt &C = ND->getAPIntValue();
25692         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
25693       }
25694       break;
25695     }
25696 
25697     return DAG.getBuildVector(VT, dl, Elts);
25698   }
25699 
25700   return DAG.getNode(Opc, dl, VT, SrcOp,
25701                      DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
25702 }
25703 
25704 /// Handle vector element shifts where the shift amount may or may not be a
25705 /// constant. Takes immediate version of shift as input.
25706 /// TODO: Replace with vector + (splat) idx to avoid extract_element nodes.
25707 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
25708                                    SDValue SrcOp, SDValue ShAmt,
25709                                    const X86Subtarget &Subtarget,
25710                                    SelectionDAG &DAG) {
25711   MVT SVT = ShAmt.getSimpleValueType();
25712   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
25713 
25714   // Change opcode to non-immediate version.
25715   Opc = getTargetVShiftUniformOpcode(Opc, true);
25716 
25717   // Need to build a vector containing shift amount.
25718   // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
25719   // +====================+============+=======================================+
25720   // | ShAmt is           | HasSSE4.1? | Construct ShAmt vector as             |
25721   // +====================+============+=======================================+
25722   // | i64                | Yes, No    | Use ShAmt as lowest elt               |
25723   // | i32                | Yes        | zero-extend in-reg                    |
25724   // | (i32 zext(i16/i8)) | Yes        | zero-extend in-reg                    |
25725   // | (i32 zext(i16/i8)) | No         | byte-shift-in-reg                     |
25726   // | i16/i32            | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
25727   // +====================+============+=======================================+
25728 
25729   if (SVT == MVT::i64)
25730     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
25731   else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
25732            ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
25733            (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
25734             ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
25735     ShAmt = ShAmt.getOperand(0);
25736     MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
25737     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
25738     if (Subtarget.hasSSE41())
25739       ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25740                           MVT::v2i64, ShAmt);
25741     else {
25742       SDValue ByteShift = DAG.getTargetConstant(
25743           (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25744       ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25745       ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25746                           ByteShift);
25747       ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25748                           ByteShift);
25749     }
25750   } else if (Subtarget.hasSSE41() &&
25751              ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
25752     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25753     ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25754                         MVT::v2i64, ShAmt);
25755   } else {
25756     SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
25757                         DAG.getUNDEF(SVT)};
25758     ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
25759   }
25760 
25761   // The return type has to be a 128-bit type with the same element
25762   // type as the input type.
25763   MVT EltVT = VT.getVectorElementType();
25764   MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25765 
25766   ShAmt = DAG.getBitcast(ShVT, ShAmt);
25767   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25768 }
25769 
/// Return \p Mask with the necessary casting or extending
/// to \p MaskVT when lowering masking intrinsics.
25772 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25773                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
25774                            const SDLoc &dl) {
25775 
25776   if (isAllOnesConstant(Mask))
25777     return DAG.getConstant(1, dl, MaskVT);
25778   if (X86::isZeroNode(Mask))
25779     return DAG.getConstant(0, dl, MaskVT);
25780 
25781   assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25782 
25783   if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25784     assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25785     assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
    // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
25787     SDValue Lo, Hi;
25788     Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25789                         DAG.getConstant(0, dl, MVT::i32));
25790     Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25791                         DAG.getConstant(1, dl, MVT::i32));
25792 
25793     Lo = DAG.getBitcast(MVT::v32i1, Lo);
25794     Hi = DAG.getBitcast(MVT::v32i1, Hi);
25795 
25796     return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25797   } else {
25798     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25799                                      Mask.getSimpleValueType().getSizeInBits());
    // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are extracted by
    // EXTRACT_SUBVECTOR.
25802     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25803                        DAG.getBitcast(BitcastVT, Mask),
25804                        DAG.getIntPtrConstant(0, dl));
25805   }
25806 }
25807 
25808 /// Return (and \p Op, \p Mask) for compare instructions or
25809 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25810 /// necessary casting or extending for \p Mask when lowering masking intrinsics
25811 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25812                                     SDValue PreservedSrc,
25813                                     const X86Subtarget &Subtarget,
25814                                     SelectionDAG &DAG) {
25815   MVT VT = Op.getSimpleValueType();
25816   MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25817   unsigned OpcodeSelect = ISD::VSELECT;
25818   SDLoc dl(Op);
25819 
25820   if (isAllOnesConstant(Mask))
25821     return Op;
25822 
25823   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25824 
25825   if (PreservedSrc.isUndef())
25826     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25827   return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25828 }
25829 
25830 /// Creates an SDNode for a predicated scalar operation.
25831 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask comes in as MVT::i8 and is transformed to MVT::v1i1 while
/// lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is the
/// use of "X86select" instead of "vselect": we simply can't create a "vselect"
/// node for a scalar instruction.
25837 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25838                                     SDValue PreservedSrc,
25839                                     const X86Subtarget &Subtarget,
25840                                     SelectionDAG &DAG) {
25841 
25842   if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25843     if (MaskConst->getZExtValue() & 0x1)
25844       return Op;
25845 
25846   MVT VT = Op.getSimpleValueType();
25847   SDLoc dl(Op);
25848 
  assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
25850   SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25851                               DAG.getBitcast(MVT::v8i1, Mask),
25852                               DAG.getIntPtrConstant(0, dl));
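  // Scalar compare/fpclass nodes already produce a mask bit themselves, so
  // the write-mask is applied with an AND rather than a select.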
25853   if (Op.getOpcode() == X86ISD::FSETCCM ||
25854       Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25855       Op.getOpcode() == X86ISD::VFPCLASSS)
25856     return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25857 
25858   if (PreservedSrc.isUndef())
25859     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25860   return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25861 }
25862 
25863 static int getSEHRegistrationNodeSize(const Function *Fn) {
25864   if (!Fn->hasPersonalityFn())
25865     report_fatal_error(
25866         "querying registration node size for function without personality");
25867   // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25868   // WinEHStatePass for the full struct definition.
25869   switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25870   case EHPersonality::MSVC_X86SEH: return 24;
25871   case EHPersonality::MSVC_CXX: return 16;
25872   default: break;
25873   }
25874   report_fatal_error(
25875       "can only recover FP for 32-bit MSVC EH personality functions");
25876 }
25877 
25878 /// When the MSVC runtime transfers control to us, either to an outlined
25879 /// function or when returning to a parent frame after catching an exception, we
25880 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25881 /// Here's the math:
25882 ///   RegNodeBase = EntryEBP - RegNodeSize
25883 ///   ParentFP = RegNodeBase - ParentFrameOffset
25884 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
25885 /// subtracting the offset (negative on x86) takes us back to the parent FP.
25886 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25887                                    SDValue EntryEBP) {
25888   MachineFunction &MF = DAG.getMachineFunction();
25889   SDLoc dl;
25890 
25891   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25892   MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25893 
25894   // It's possible that the parent function no longer has a personality function
25895   // if the exceptional code was optimized away, in which case we just return
25896   // the incoming EBP.
25897   if (!Fn->hasPersonalityFn())
25898     return EntryEBP;
25899 
25900   // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25901   // registration, or the .set_setframe offset.
25902   MCSymbol *OffsetSym =
25903       MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
25904           GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25905   SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25906   SDValue ParentFrameOffset =
25907       DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25908 
25909   // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25910   // prologue to RBP in the parent function.
25911   const X86Subtarget &Subtarget =
25912       static_cast<const X86Subtarget &>(DAG.getSubtarget());
25913   if (Subtarget.is64Bit())
25914     return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25915 
25916   int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25917   // RegNodeBase = EntryEBP - RegNodeSize
25918   // ParentFP = RegNodeBase - ParentFrameOffset
25919   SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25920                                     DAG.getConstant(RegNodeSize, dl, PtrVT));
25921   return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25922 }
25923 
25924 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25925                                                    SelectionDAG &DAG) const {
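  // Rounding-mode immediates follow X86::STATIC_ROUNDING: TO_NEAREST_INT=0,
  // TO_NEG_INF=1, TO_POS_INF=2, TO_ZERO=3, CUR_DIRECTION=4; NO_EXC=8 is the
  // SAE bit and may be OR'ed with one of the rounding modes.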
25926   // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25927   auto isRoundModeCurDirection = [](SDValue Rnd) {
25928     if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25929       return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25930 
25931     return false;
25932   };
25933   auto isRoundModeSAE = [](SDValue Rnd) {
25934     if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25935       unsigned RC = C->getZExtValue();
25936       if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25937         // Clear the NO_EXC bit and check remaining bits.
25938         RC ^= X86::STATIC_ROUNDING::NO_EXC;
        // As a convenience we accept either no other bits set or an
        // explicitly specified current-direction rounding mode.
25941         return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25942       }
25943     }
25944 
25945     return false;
25946   };
25947   auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25948     if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25949       RC = C->getZExtValue();
25950       if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25951         // Clear the NO_EXC bit and check remaining bits.
25952         RC ^= X86::STATIC_ROUNDING::NO_EXC;
25953         return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25954                RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25955                RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25956                RC == X86::STATIC_ROUNDING::TO_ZERO;
25957       }
25958     }
25959 
25960     return false;
25961   };
25962 
25963   SDLoc dl(Op);
25964   unsigned IntNo = Op.getConstantOperandVal(0);
25965   MVT VT = Op.getSimpleValueType();
25966   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25967 
25968   // Propagate flags from original node to transformed node(s).
25969   SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25970 
25971   if (IntrData) {
25972     switch(IntrData->Type) {
25973     case INTR_TYPE_1OP: {
      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check whether the intrinsic may have a non-default rounding
      // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
25977       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25978       if (IntrWithRoundingModeOpcode != 0) {
25979         SDValue Rnd = Op.getOperand(2);
25980         unsigned RC = 0;
25981         if (isRoundModeSAEToX(Rnd, RC))
25982           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25983                              Op.getOperand(1),
25984                              DAG.getTargetConstant(RC, dl, MVT::i32));
25985         if (!isRoundModeCurDirection(Rnd))
25986           return SDValue();
25987       }
25988       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25989                          Op.getOperand(1));
25990     }
25991     case INTR_TYPE_1OP_SAE: {
25992       SDValue Sae = Op.getOperand(2);
25993 
25994       unsigned Opc;
25995       if (isRoundModeCurDirection(Sae))
25996         Opc = IntrData->Opc0;
25997       else if (isRoundModeSAE(Sae))
25998         Opc = IntrData->Opc1;
25999       else
26000         return SDValue();
26001 
26002       return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26003     }
26004     case INTR_TYPE_2OP: {
26005       SDValue Src2 = Op.getOperand(2);
26006 
      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check whether the intrinsic may have a non-default rounding
      // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
26010       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26011       if (IntrWithRoundingModeOpcode != 0) {
26012         SDValue Rnd = Op.getOperand(3);
26013         unsigned RC = 0;
26014         if (isRoundModeSAEToX(Rnd, RC))
26015           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26016                              Op.getOperand(1), Src2,
26017                              DAG.getTargetConstant(RC, dl, MVT::i32));
26018         if (!isRoundModeCurDirection(Rnd))
26019           return SDValue();
26020       }
26021 
26022       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26023                          Op.getOperand(1), Src2);
26024     }
26025     case INTR_TYPE_2OP_SAE: {
26026       SDValue Sae = Op.getOperand(3);
26027 
26028       unsigned Opc;
26029       if (isRoundModeCurDirection(Sae))
26030         Opc = IntrData->Opc0;
26031       else if (isRoundModeSAE(Sae))
26032         Opc = IntrData->Opc1;
26033       else
26034         return SDValue();
26035 
26036       return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26037                          Op.getOperand(2));
26038     }
26039     case INTR_TYPE_3OP:
26040     case INTR_TYPE_3OP_IMM8: {
26041       SDValue Src1 = Op.getOperand(1);
26042       SDValue Src2 = Op.getOperand(2);
26043       SDValue Src3 = Op.getOperand(3);
26044 
26045       if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26046           Src3.getValueType() != MVT::i8) {
26047         Src3 = DAG.getTargetConstant(
26048             cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
26049       }
26050 
      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check whether the intrinsic may have a non-default rounding
      // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
26054       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26055       if (IntrWithRoundingModeOpcode != 0) {
26056         SDValue Rnd = Op.getOperand(4);
26057         unsigned RC = 0;
26058         if (isRoundModeSAEToX(Rnd, RC))
26059           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26060                              Src1, Src2, Src3,
26061                              DAG.getTargetConstant(RC, dl, MVT::i32));
26062         if (!isRoundModeCurDirection(Rnd))
26063           return SDValue();
26064       }
26065 
26066       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26067                          {Src1, Src2, Src3});
26068     }
26069     case INTR_TYPE_4OP_IMM8: {
26070       assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26071       SDValue Src4 = Op.getOperand(4);
26072       if (Src4.getValueType() != MVT::i8) {
26073         Src4 = DAG.getTargetConstant(
26074             cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
26075       }
26076 
26077       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26078                          Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26079                          Src4);
26080     }
26081     case INTR_TYPE_1OP_MASK: {
26082       SDValue Src = Op.getOperand(1);
26083       SDValue PassThru = Op.getOperand(2);
26084       SDValue Mask = Op.getOperand(3);
26085       // We add rounding mode to the Node when
26086       //   - RC Opcode is specified and
26087       //   - RC is not "current direction".
26088       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26089       if (IntrWithRoundingModeOpcode != 0) {
26090         SDValue Rnd = Op.getOperand(4);
26091         unsigned RC = 0;
26092         if (isRoundModeSAEToX(Rnd, RC))
26093           return getVectorMaskingNode(
26094               DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26095                           Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26096               Mask, PassThru, Subtarget, DAG);
26097         if (!isRoundModeCurDirection(Rnd))
26098           return SDValue();
26099       }
26100       return getVectorMaskingNode(
26101           DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26102           Subtarget, DAG);
26103     }
26104     case INTR_TYPE_1OP_MASK_SAE: {
26105       SDValue Src = Op.getOperand(1);
26106       SDValue PassThru = Op.getOperand(2);
26107       SDValue Mask = Op.getOperand(3);
26108       SDValue Rnd = Op.getOperand(4);
26109 
26110       unsigned Opc;
26111       if (isRoundModeCurDirection(Rnd))
26112         Opc = IntrData->Opc0;
26113       else if (isRoundModeSAE(Rnd))
26114         Opc = IntrData->Opc1;
26115       else
26116         return SDValue();
26117 
26118       return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26119                                   Subtarget, DAG);
26120     }
26121     case INTR_TYPE_SCALAR_MASK: {
26122       SDValue Src1 = Op.getOperand(1);
26123       SDValue Src2 = Op.getOperand(2);
26124       SDValue passThru = Op.getOperand(3);
26125       SDValue Mask = Op.getOperand(4);
26126       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26127       // There are 2 kinds of intrinsics in this group:
      // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
      // (2) With rounding mode and sae - 7 operands.
26130       bool HasRounding = IntrWithRoundingModeOpcode != 0;
26131       if (Op.getNumOperands() == (5U + HasRounding)) {
26132         if (HasRounding) {
26133           SDValue Rnd = Op.getOperand(5);
26134           unsigned RC = 0;
26135           if (isRoundModeSAEToX(Rnd, RC))
26136             return getScalarMaskingNode(
26137                 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26138                             DAG.getTargetConstant(RC, dl, MVT::i32)),
26139                 Mask, passThru, Subtarget, DAG);
26140           if (!isRoundModeCurDirection(Rnd))
26141             return SDValue();
26142         }
26143         return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26144                                                 Src2),
26145                                     Mask, passThru, Subtarget, DAG);
26146       }
26147 
26148       assert(Op.getNumOperands() == (6U + HasRounding) &&
26149              "Unexpected intrinsic form");
26150       SDValue RoundingMode = Op.getOperand(5);
26151       unsigned Opc = IntrData->Opc0;
26152       if (HasRounding) {
26153         SDValue Sae = Op.getOperand(6);
26154         if (isRoundModeSAE(Sae))
26155           Opc = IntrWithRoundingModeOpcode;
26156         else if (!isRoundModeCurDirection(Sae))
26157           return SDValue();
26158       }
26159       return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26160                                               Src2, RoundingMode),
26161                                   Mask, passThru, Subtarget, DAG);
26162     }
26163     case INTR_TYPE_SCALAR_MASK_RND: {
26164       SDValue Src1 = Op.getOperand(1);
26165       SDValue Src2 = Op.getOperand(2);
26166       SDValue passThru = Op.getOperand(3);
26167       SDValue Mask = Op.getOperand(4);
26168       SDValue Rnd = Op.getOperand(5);
26169 
26170       SDValue NewOp;
26171       unsigned RC = 0;
26172       if (isRoundModeCurDirection(Rnd))
26173         NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26174       else if (isRoundModeSAEToX(Rnd, RC))
26175         NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26176                             DAG.getTargetConstant(RC, dl, MVT::i32));
26177       else
26178         return SDValue();
26179 
26180       return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26181     }
26182     case INTR_TYPE_SCALAR_MASK_SAE: {
26183       SDValue Src1 = Op.getOperand(1);
26184       SDValue Src2 = Op.getOperand(2);
26185       SDValue passThru = Op.getOperand(3);
26186       SDValue Mask = Op.getOperand(4);
26187       SDValue Sae = Op.getOperand(5);
26188       unsigned Opc;
26189       if (isRoundModeCurDirection(Sae))
26190         Opc = IntrData->Opc0;
26191       else if (isRoundModeSAE(Sae))
26192         Opc = IntrData->Opc1;
26193       else
26194         return SDValue();
26195 
26196       return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26197                                   Mask, passThru, Subtarget, DAG);
26198     }
26199     case INTR_TYPE_2OP_MASK: {
26200       SDValue Src1 = Op.getOperand(1);
26201       SDValue Src2 = Op.getOperand(2);
26202       SDValue PassThru = Op.getOperand(3);
26203       SDValue Mask = Op.getOperand(4);
26204       SDValue NewOp;
26205       if (IntrData->Opc1 != 0) {
26206         SDValue Rnd = Op.getOperand(5);
26207         unsigned RC = 0;
26208         if (isRoundModeSAEToX(Rnd, RC))
26209           NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26210                               DAG.getTargetConstant(RC, dl, MVT::i32));
26211         else if (!isRoundModeCurDirection(Rnd))
26212           return SDValue();
26213       }
26214       if (!NewOp)
26215         NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26216       return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26217     }
26218     case INTR_TYPE_2OP_MASK_SAE: {
26219       SDValue Src1 = Op.getOperand(1);
26220       SDValue Src2 = Op.getOperand(2);
26221       SDValue PassThru = Op.getOperand(3);
26222       SDValue Mask = Op.getOperand(4);
26223 
26224       unsigned Opc = IntrData->Opc0;
26225       if (IntrData->Opc1 != 0) {
26226         SDValue Sae = Op.getOperand(5);
26227         if (isRoundModeSAE(Sae))
26228           Opc = IntrData->Opc1;
26229         else if (!isRoundModeCurDirection(Sae))
26230           return SDValue();
26231       }
26232 
26233       return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26234                                   Mask, PassThru, Subtarget, DAG);
26235     }
26236     case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26237       SDValue Src1 = Op.getOperand(1);
26238       SDValue Src2 = Op.getOperand(2);
26239       SDValue Src3 = Op.getOperand(3);
26240       SDValue PassThru = Op.getOperand(4);
26241       SDValue Mask = Op.getOperand(5);
26242       SDValue Sae = Op.getOperand(6);
26243       unsigned Opc;
26244       if (isRoundModeCurDirection(Sae))
26245         Opc = IntrData->Opc0;
26246       else if (isRoundModeSAE(Sae))
26247         Opc = IntrData->Opc1;
26248       else
26249         return SDValue();
26250 
26251       return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26252                                   Mask, PassThru, Subtarget, DAG);
26253     }
26254     case INTR_TYPE_3OP_MASK_SAE: {
26255       SDValue Src1 = Op.getOperand(1);
26256       SDValue Src2 = Op.getOperand(2);
26257       SDValue Src3 = Op.getOperand(3);
26258       SDValue PassThru = Op.getOperand(4);
26259       SDValue Mask = Op.getOperand(5);
26260 
26261       unsigned Opc = IntrData->Opc0;
26262       if (IntrData->Opc1 != 0) {
26263         SDValue Sae = Op.getOperand(6);
26264         if (isRoundModeSAE(Sae))
26265           Opc = IntrData->Opc1;
26266         else if (!isRoundModeCurDirection(Sae))
26267           return SDValue();
26268       }
26269       return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26270                                   Mask, PassThru, Subtarget, DAG);
26271     }
26272     case BLENDV: {
26273       SDValue Src1 = Op.getOperand(1);
26274       SDValue Src2 = Op.getOperand(2);
26275       SDValue Src3 = Op.getOperand(3);
26276 
26277       EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26278       Src3 = DAG.getBitcast(MaskVT, Src3);
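      // blendv(a, b, mask) returns b wherever the mask sign bit is set and a
      // elsewhere, i.e. (vselect mask, b, a).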
26279 
26280       // Reverse the operands to match VSELECT order.
26281       return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26282     }
    case VPERM_2OP: {
26284       SDValue Src1 = Op.getOperand(1);
26285       SDValue Src2 = Op.getOperand(2);
26286 
26287       // Swap Src1 and Src2 in the node creation
      return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
26289     }
26290     case CFMA_OP_MASKZ:
26291     case CFMA_OP_MASK: {
26292       SDValue Src1 = Op.getOperand(1);
26293       SDValue Src2 = Op.getOperand(2);
26294       SDValue Src3 = Op.getOperand(3);
26295       SDValue Mask = Op.getOperand(4);
26296       MVT VT = Op.getSimpleValueType();
26297 
26298       SDValue PassThru = Src3;
26299       if (IntrData->Type == CFMA_OP_MASKZ)
26300         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26301 
26302       // We add rounding mode to the Node when
26303       //   - RC Opcode is specified and
26304       //   - RC is not "current direction".
26305       SDValue NewOp;
26306       if (IntrData->Opc1 != 0) {
26307         SDValue Rnd = Op.getOperand(5);
26308         unsigned RC = 0;
26309         if (isRoundModeSAEToX(Rnd, RC))
26310           NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26311                               DAG.getTargetConstant(RC, dl, MVT::i32));
26312         else if (!isRoundModeCurDirection(Rnd))
26313           return SDValue();
26314       }
26315       if (!NewOp)
26316         NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26317       return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26318     }
26319     case IFMA_OP:
26320       // NOTE: We need to swizzle the operands to pass the multiply operands
26321       // first.
26322       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26323                          Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26324     case FPCLASSS: {
26325       SDValue Src1 = Op.getOperand(1);
26326       SDValue Imm = Op.getOperand(2);
26327       SDValue Mask = Op.getOperand(3);
26328       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26329       SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26330                                                  Subtarget, DAG);
26331       // Need to fill with zeros to ensure the bitcast will produce zeroes
26332       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26333       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26334                                 DAG.getConstant(0, dl, MVT::v8i1),
26335                                 FPclassMask, DAG.getIntPtrConstant(0, dl));
26336       return DAG.getBitcast(MVT::i8, Ins);
26337     }
26338 
26339     case CMP_MASK_CC: {
26340       MVT MaskVT = Op.getSimpleValueType();
26341       SDValue CC = Op.getOperand(3);
26342       SDValue Mask = Op.getOperand(4);
      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check whether the intrinsic may have a non-default rounding
      // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
26346       if (IntrData->Opc1 != 0) {
26347         SDValue Sae = Op.getOperand(5);
26348         if (isRoundModeSAE(Sae))
26349           return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26350                              Op.getOperand(2), CC, Mask, Sae);
26351         if (!isRoundModeCurDirection(Sae))
26352           return SDValue();
26353       }
      // Default rounding mode.
26355       return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26356                          {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26357     }
26358     case CMP_MASK_SCALAR_CC: {
26359       SDValue Src1 = Op.getOperand(1);
26360       SDValue Src2 = Op.getOperand(2);
26361       SDValue CC = Op.getOperand(3);
26362       SDValue Mask = Op.getOperand(4);
26363 
26364       SDValue Cmp;
26365       if (IntrData->Opc1 != 0) {
26366         SDValue Sae = Op.getOperand(5);
26367         if (isRoundModeSAE(Sae))
26368           Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26369         else if (!isRoundModeCurDirection(Sae))
26370           return SDValue();
26371       }
      // Default rounding mode.
26373       if (!Cmp.getNode())
26374         Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26375 
26376       SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26377                                              Subtarget, DAG);
26378       // Need to fill with zeros to ensure the bitcast will produce zeroes
26379       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26380       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26381                                 DAG.getConstant(0, dl, MVT::v8i1),
26382                                 CmpMask, DAG.getIntPtrConstant(0, dl));
26383       return DAG.getBitcast(MVT::i8, Ins);
26384     }
26385     case COMI: { // Comparison intrinsics
26386       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26387       SDValue LHS = Op.getOperand(1);
26388       SDValue RHS = Op.getOperand(2);
26389       // Some conditions require the operands to be swapped.
26390       if (CC == ISD::SETLT || CC == ISD::SETLE)
26391         std::swap(LHS, RHS);
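      // (U)COMIS sets CF/ZF/PF like an unsigned compare, so only the
      // "above"-style conditions map directly; LT/LE are checked as GT/GE
      // with the operands swapped.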
26392 
26393       SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
26394       SDValue SetCC;
26395       switch (CC) {
26396       case ISD::SETEQ: { // (ZF = 0 and PF = 0)
26397         SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26398         SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26399         SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26400         break;
26401       }
26402       case ISD::SETNE: { // (ZF = 1 or PF = 1)
26403         SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26404         SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26405         SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26406         break;
26407       }
26408       case ISD::SETGT: // (CF = 0 and ZF = 0)
26409       case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26410         SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26411         break;
26412       }
26413       case ISD::SETGE: // CF = 0
26414       case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26415         SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26416         break;
26417       default:
26418         llvm_unreachable("Unexpected illegal condition!");
26419       }
26420       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26421     }
26422     case COMI_RM: { // Comparison intrinsics with Sae
26423       SDValue LHS = Op.getOperand(1);
26424       SDValue RHS = Op.getOperand(2);
26425       unsigned CondVal = Op.getConstantOperandVal(3);
26426       SDValue Sae = Op.getOperand(4);
26427 
26428       SDValue FCmp;
26429       if (isRoundModeCurDirection(Sae))
26430         FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26431                            DAG.getTargetConstant(CondVal, dl, MVT::i8));
26432       else if (isRoundModeSAE(Sae))
26433         FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26434                            DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26435       else
26436         return SDValue();
26437       // Need to fill with zeros to ensure the bitcast will produce zeroes
26438       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26439       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26440                                 DAG.getConstant(0, dl, MVT::v16i1),
26441                                 FCmp, DAG.getIntPtrConstant(0, dl));
26442       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26443                          DAG.getBitcast(MVT::i16, Ins));
26444     }
26445     case VSHIFT: {
26446       SDValue SrcOp = Op.getOperand(1);
26447       SDValue ShAmt = Op.getOperand(2);
26448 
26449       // Catch shift-by-constant.
26450       if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26451         return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26452                                           Op.getSimpleValueType(), SrcOp,
26453                                           CShAmt->getZExtValue(), DAG);
26454 
26455       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26456                                  SrcOp, ShAmt, Subtarget, DAG);
26457     }
26458     case COMPRESS_EXPAND_IN_REG: {
26459       SDValue Mask = Op.getOperand(3);
26460       SDValue DataToCompress = Op.getOperand(1);
26461       SDValue PassThru = Op.getOperand(2);
26462       if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26463         return Op.getOperand(1);
26464 
26465       // Avoid false dependency.
26466       if (PassThru.isUndef())
26467         PassThru = DAG.getConstant(0, dl, VT);
26468 
26469       return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26470                          Mask);
26471     }
26472     case FIXUPIMM:
26473     case FIXUPIMM_MASKZ: {
26474       SDValue Src1 = Op.getOperand(1);
26475       SDValue Src2 = Op.getOperand(2);
26476       SDValue Src3 = Op.getOperand(3);
26477       SDValue Imm = Op.getOperand(4);
26478       SDValue Mask = Op.getOperand(5);
26479       SDValue Passthru = (IntrData->Type == FIXUPIMM)
26480                              ? Src1
26481                              : getZeroVector(VT, Subtarget, DAG, dl);
26482 
26483       unsigned Opc = IntrData->Opc0;
26484       if (IntrData->Opc1 != 0) {
26485         SDValue Sae = Op.getOperand(6);
26486         if (isRoundModeSAE(Sae))
26487           Opc = IntrData->Opc1;
26488         else if (!isRoundModeCurDirection(Sae))
26489           return SDValue();
26490       }
26491 
26492       SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26493 
26494       if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
26495         return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26496 
26497       return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26498     }
26499     case ROUNDP: {
26500       assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26501       // Clear the upper bits of the rounding immediate so that the legacy
26502       // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26503       auto Round = cast<ConstantSDNode>(Op.getOperand(2));
26504       SDValue RoundingMode =
26505           DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
26506       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26507                          Op.getOperand(1), RoundingMode);
26508     }
26509     case ROUNDS: {
26510       assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26511       // Clear the upper bits of the rounding immediate so that the legacy
26512       // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26513       auto Round = cast<ConstantSDNode>(Op.getOperand(3));
26514       SDValue RoundingMode =
26515           DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
26516       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26517                          Op.getOperand(1), Op.getOperand(2), RoundingMode);
26518     }
26519     case BEXTRI: {
26520       assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26521 
26522       uint64_t Imm = Op.getConstantOperandVal(2);
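      // Only the low 16 bits of the control are meaningful: bits [7:0] give
      // the start bit index and bits [15:8] the extract length.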
26523       SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26524                                               Op.getValueType());
26525       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26526                          Op.getOperand(1), Control);
26527     }
26528     // ADC/ADCX/SBB
26529     case ADX: {
26530       SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26531       SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26532 
26533       SDValue Res;
26534       // If the carry in is zero, then we should just use ADD/SUB instead of
26535       // ADC/SBB.
26536       if (isNullConstant(Op.getOperand(1))) {
26537         Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26538                           Op.getOperand(3));
26539       } else {
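        // Materialize CF from the incoming carry operand: adding -1 (0xFF) to
        // a non-zero i8 value produces a carry-out, adding it to zero does
        // not.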
26540         SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26541                                     DAG.getConstant(-1, dl, MVT::i8));
26542         Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26543                           Op.getOperand(3), GenCF.getValue(1));
26544       }
26545       SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
26546       SDValue Results[] = { SetCC, Res };
26547       return DAG.getMergeValues(Results, dl);
26548     }
26549     case CVTPD2PS_MASK:
26550     case CVTPD2DQ_MASK:
26551     case CVTQQ2PS_MASK:
26552     case TRUNCATE_TO_REG: {
26553       SDValue Src = Op.getOperand(1);
26554       SDValue PassThru = Op.getOperand(2);
26555       SDValue Mask = Op.getOperand(3);
26556 
26557       if (isAllOnesConstant(Mask))
26558         return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26559 
26560       MVT SrcVT = Src.getSimpleValueType();
26561       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26562       Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26563       return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26564                          {Src, PassThru, Mask});
26565     }
26566     case CVTPS2PH_MASK: {
26567       SDValue Src = Op.getOperand(1);
26568       SDValue Rnd = Op.getOperand(2);
26569       SDValue PassThru = Op.getOperand(3);
26570       SDValue Mask = Op.getOperand(4);
26571 
26572       if (isAllOnesConstant(Mask))
26573         return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
26574 
26575       MVT SrcVT = Src.getSimpleValueType();
26576       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26577       Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26578       return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
26579                          PassThru, Mask);
26580 
26581     }
26582     case CVTNEPS2BF16_MASK: {
26583       SDValue Src = Op.getOperand(1);
26584       SDValue PassThru = Op.getOperand(2);
26585       SDValue Mask = Op.getOperand(3);
26586 
26587       if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26588         return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26589 
26590       // Break false dependency.
26591       if (PassThru.isUndef())
26592         PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
26593 
26594       return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26595                          Mask);
26596     }
26597     default:
26598       break;
26599     }
26600   }
26601 
26602   switch (IntNo) {
26603   default: return SDValue();    // Don't custom lower most intrinsics.
26604 
  // ptest and testp intrinsics. The intrinsics these come from are designed to
  // return an integer value, not just an instruction, so lower them to the
  // ptest or testp pattern and a setcc for the result.
26608   case Intrinsic::x86_avx512_ktestc_b:
26609   case Intrinsic::x86_avx512_ktestc_w:
26610   case Intrinsic::x86_avx512_ktestc_d:
26611   case Intrinsic::x86_avx512_ktestc_q:
26612   case Intrinsic::x86_avx512_ktestz_b:
26613   case Intrinsic::x86_avx512_ktestz_w:
26614   case Intrinsic::x86_avx512_ktestz_d:
26615   case Intrinsic::x86_avx512_ktestz_q:
26616   case Intrinsic::x86_sse41_ptestz:
26617   case Intrinsic::x86_sse41_ptestc:
26618   case Intrinsic::x86_sse41_ptestnzc:
26619   case Intrinsic::x86_avx_ptestz_256:
26620   case Intrinsic::x86_avx_ptestc_256:
26621   case Intrinsic::x86_avx_ptestnzc_256:
26622   case Intrinsic::x86_avx_vtestz_ps:
26623   case Intrinsic::x86_avx_vtestc_ps:
26624   case Intrinsic::x86_avx_vtestnzc_ps:
26625   case Intrinsic::x86_avx_vtestz_pd:
26626   case Intrinsic::x86_avx_vtestc_pd:
26627   case Intrinsic::x86_avx_vtestnzc_pd:
26628   case Intrinsic::x86_avx_vtestz_ps_256:
26629   case Intrinsic::x86_avx_vtestc_ps_256:
26630   case Intrinsic::x86_avx_vtestnzc_ps_256:
26631   case Intrinsic::x86_avx_vtestz_pd_256:
26632   case Intrinsic::x86_avx_vtestc_pd_256:
26633   case Intrinsic::x86_avx_vtestnzc_pd_256: {
26634     unsigned TestOpc = X86ISD::PTEST;
26635     X86::CondCode X86CC;
26636     switch (IntNo) {
26637     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
26638     case Intrinsic::x86_avx512_ktestc_b:
26639     case Intrinsic::x86_avx512_ktestc_w:
26640     case Intrinsic::x86_avx512_ktestc_d:
26641     case Intrinsic::x86_avx512_ktestc_q:
26642       // CF = 1
26643       TestOpc = X86ISD::KTEST;
26644       X86CC = X86::COND_B;
26645       break;
26646     case Intrinsic::x86_avx512_ktestz_b:
26647     case Intrinsic::x86_avx512_ktestz_w:
26648     case Intrinsic::x86_avx512_ktestz_d:
26649     case Intrinsic::x86_avx512_ktestz_q:
26650       TestOpc = X86ISD::KTEST;
26651       X86CC = X86::COND_E;
26652       break;
26653     case Intrinsic::x86_avx_vtestz_ps:
26654     case Intrinsic::x86_avx_vtestz_pd:
26655     case Intrinsic::x86_avx_vtestz_ps_256:
26656     case Intrinsic::x86_avx_vtestz_pd_256:
26657       TestOpc = X86ISD::TESTP;
26658       LLVM_FALLTHROUGH;
26659     case Intrinsic::x86_sse41_ptestz:
26660     case Intrinsic::x86_avx_ptestz_256:
26661       // ZF = 1
26662       X86CC = X86::COND_E;
26663       break;
26664     case Intrinsic::x86_avx_vtestc_ps:
26665     case Intrinsic::x86_avx_vtestc_pd:
26666     case Intrinsic::x86_avx_vtestc_ps_256:
26667     case Intrinsic::x86_avx_vtestc_pd_256:
26668       TestOpc = X86ISD::TESTP;
26669       LLVM_FALLTHROUGH;
26670     case Intrinsic::x86_sse41_ptestc:
26671     case Intrinsic::x86_avx_ptestc_256:
26672       // CF = 1
26673       X86CC = X86::COND_B;
26674       break;
26675     case Intrinsic::x86_avx_vtestnzc_ps:
26676     case Intrinsic::x86_avx_vtestnzc_pd:
26677     case Intrinsic::x86_avx_vtestnzc_ps_256:
26678     case Intrinsic::x86_avx_vtestnzc_pd_256:
26679       TestOpc = X86ISD::TESTP;
26680       LLVM_FALLTHROUGH;
26681     case Intrinsic::x86_sse41_ptestnzc:
26682     case Intrinsic::x86_avx_ptestnzc_256:
26683       // ZF and CF = 0
26684       X86CC = X86::COND_A;
26685       break;
26686     }
26687 
26688     SDValue LHS = Op.getOperand(1);
26689     SDValue RHS = Op.getOperand(2);
26690     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
26691     SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
26692     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26693   }
26694 
26695   case Intrinsic::x86_sse42_pcmpistria128:
26696   case Intrinsic::x86_sse42_pcmpestria128:
26697   case Intrinsic::x86_sse42_pcmpistric128:
26698   case Intrinsic::x86_sse42_pcmpestric128:
26699   case Intrinsic::x86_sse42_pcmpistrio128:
26700   case Intrinsic::x86_sse42_pcmpestrio128:
26701   case Intrinsic::x86_sse42_pcmpistris128:
26702   case Intrinsic::x86_sse42_pcmpestris128:
26703   case Intrinsic::x86_sse42_pcmpistriz128:
26704   case Intrinsic::x86_sse42_pcmpestriz128: {
26705     unsigned Opcode;
26706     X86::CondCode X86CC;
26707     switch (IntNo) {
26708     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
26709     case Intrinsic::x86_sse42_pcmpistria128:
26710       Opcode = X86ISD::PCMPISTR;
26711       X86CC = X86::COND_A;
26712       break;
26713     case Intrinsic::x86_sse42_pcmpestria128:
26714       Opcode = X86ISD::PCMPESTR;
26715       X86CC = X86::COND_A;
26716       break;
26717     case Intrinsic::x86_sse42_pcmpistric128:
26718       Opcode = X86ISD::PCMPISTR;
26719       X86CC = X86::COND_B;
26720       break;
26721     case Intrinsic::x86_sse42_pcmpestric128:
26722       Opcode = X86ISD::PCMPESTR;
26723       X86CC = X86::COND_B;
26724       break;
26725     case Intrinsic::x86_sse42_pcmpistrio128:
26726       Opcode = X86ISD::PCMPISTR;
26727       X86CC = X86::COND_O;
26728       break;
26729     case Intrinsic::x86_sse42_pcmpestrio128:
26730       Opcode = X86ISD::PCMPESTR;
26731       X86CC = X86::COND_O;
26732       break;
26733     case Intrinsic::x86_sse42_pcmpistris128:
26734       Opcode = X86ISD::PCMPISTR;
26735       X86CC = X86::COND_S;
26736       break;
26737     case Intrinsic::x86_sse42_pcmpestris128:
26738       Opcode = X86ISD::PCMPESTR;
26739       X86CC = X86::COND_S;
26740       break;
26741     case Intrinsic::x86_sse42_pcmpistriz128:
26742       Opcode = X86ISD::PCMPISTR;
26743       X86CC = X86::COND_E;
26744       break;
26745     case Intrinsic::x86_sse42_pcmpestriz128:
26746       Opcode = X86ISD::PCMPESTR;
26747       X86CC = X86::COND_E;
26748       break;
26749     }
26750     SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26751     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
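    // The node results are (index : i32, mask : v16i8, EFLAGS : i32); these
    // intrinsics only consume the flags result.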
26752     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
26753     SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
26754     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26755   }
26756 
26757   case Intrinsic::x86_sse42_pcmpistri128:
26758   case Intrinsic::x86_sse42_pcmpestri128: {
26759     unsigned Opcode;
26760     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
26761       Opcode = X86ISD::PCMPISTR;
26762     else
26763       Opcode = X86ISD::PCMPESTR;
26764 
26765     SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26766     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26767     return DAG.getNode(Opcode, dl, VTs, NewOps);
26768   }
26769 
26770   case Intrinsic::x86_sse42_pcmpistrm128:
26771   case Intrinsic::x86_sse42_pcmpestrm128: {
26772     unsigned Opcode;
26773     if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26774       Opcode = X86ISD::PCMPISTR;
26775     else
26776       Opcode = X86ISD::PCMPESTR;
26777 
26778     SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26779     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26780     return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26781   }
26782 
26783   case Intrinsic::eh_sjlj_lsda: {
26784     MachineFunction &MF = DAG.getMachineFunction();
26785     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26786     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26787     auto &Context = MF.getMMI().getContext();
26788     MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26789                                             Twine(MF.getFunctionNumber()));
26790     return DAG.getNode(getGlobalWrapperKind(), dl, VT,
26791                        DAG.getMCSymbol(S, PtrVT));
26792   }
26793 
26794   case Intrinsic::x86_seh_lsda: {
26795     // Compute the symbol for the LSDA. We know it'll get emitted later.
26796     MachineFunction &MF = DAG.getMachineFunction();
26797     SDValue Op1 = Op.getOperand(1);
26798     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26799     MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
26800         GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26801 
26802     // Generate a simple absolute symbol reference. This intrinsic is only
26803     // supported on 32-bit Windows, which isn't PIC.
26804     SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26805     return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26806   }
26807 
26808   case Intrinsic::eh_recoverfp: {
26809     SDValue FnOp = Op.getOperand(1);
26810     SDValue IncomingFPOp = Op.getOperand(2);
26811     GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26812     auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26813     if (!Fn)
26814       report_fatal_error(
26815           "llvm.eh.recoverfp must take a function as the first argument");
26816     return recoverFramePointer(DAG, Fn, IncomingFPOp);
26817   }
26818 
26819   case Intrinsic::localaddress: {
26820     // Returns one of the stack, base, or frame pointer registers, depending on
26821     // which is used to reference local variables.
26822     MachineFunction &MF = DAG.getMachineFunction();
26823     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26824     unsigned Reg;
26825     if (RegInfo->hasBasePointer(MF))
26826       Reg = RegInfo->getBaseRegister();
26827     else { // Handles the SP or FP case.
26828       bool CantUseFP = RegInfo->hasStackRealignment(MF);
26829       if (CantUseFP)
26830         Reg = RegInfo->getPtrSizedStackRegister(MF);
26831       else
26832         Reg = RegInfo->getPtrSizedFrameRegister(MF);
26833     }
26834     return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26835   }
26836   case Intrinsic::swift_async_context_addr: {
26837     auto &MF = DAG.getMachineFunction();
26838     auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26839     if (Subtarget.is64Bit()) {
26840       MF.getFrameInfo().setFrameAddressIsTaken(true);
26841       X86FI->setHasSwiftAsyncContext(true);
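      // In the Swift extended frame the async context lives in the slot
      // immediately below the saved frame pointer, so its address is RBP - 8.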
26842       return SDValue(
26843           DAG.getMachineNode(
26844               X86::SUB64ri8, dl, MVT::i64,
26845               DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
26846               DAG.getTargetConstant(8, dl, MVT::i32)),
26847           0);
26848     } else {
      // 32-bit targets have no special extended frame, so create or reuse an
      // existing stack slot.
26851       if (!X86FI->getSwiftAsyncContextFrameIdx())
26852         X86FI->setSwiftAsyncContextFrameIdx(
26853             MF.getFrameInfo().CreateStackObject(4, Align(4), false));
26854       return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
26855     }
26856   }
26857   case Intrinsic::x86_avx512_vp2intersect_q_512:
26858   case Intrinsic::x86_avx512_vp2intersect_q_256:
26859   case Intrinsic::x86_avx512_vp2intersect_q_128:
26860   case Intrinsic::x86_avx512_vp2intersect_d_512:
26861   case Intrinsic::x86_avx512_vp2intersect_d_256:
26862   case Intrinsic::x86_avx512_vp2intersect_d_128: {
26863     MVT MaskVT = Op.getSimpleValueType();
26864 
26865     SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26866     SDLoc DL(Op);
26867 
26868     SDValue Operation =
26869         DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
26870                     Op->getOperand(1), Op->getOperand(2));
26871 
26872     SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26873                                                  MaskVT, Operation);
26874     SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26875                                                  MaskVT, Operation);
26876     return DAG.getMergeValues({Result0, Result1}, DL);
26877   }
26878   case Intrinsic::x86_mmx_pslli_w:
26879   case Intrinsic::x86_mmx_pslli_d:
26880   case Intrinsic::x86_mmx_pslli_q:
26881   case Intrinsic::x86_mmx_psrli_w:
26882   case Intrinsic::x86_mmx_psrli_d:
26883   case Intrinsic::x86_mmx_psrli_q:
26884   case Intrinsic::x86_mmx_psrai_w:
26885   case Intrinsic::x86_mmx_psrai_d: {
26886     SDLoc DL(Op);
26887     SDValue ShAmt = Op.getOperand(2);
26888     // If the argument is a constant, convert it to a target constant.
26889     if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
      // Clamp out-of-bounds shift amounts since they will otherwise be masked
      // to 8 bits, which may make them no longer out of bounds.
26892       unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26893       if (ShiftAmount == 0)
26894         return Op.getOperand(1);
26895 
26896       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26897                          Op.getOperand(0), Op.getOperand(1),
26898                          DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26899     }
26900 
26901     unsigned NewIntrinsic;
26902     switch (IntNo) {
26903     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
26904     case Intrinsic::x86_mmx_pslli_w:
26905       NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26906       break;
26907     case Intrinsic::x86_mmx_pslli_d:
26908       NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26909       break;
26910     case Intrinsic::x86_mmx_pslli_q:
26911       NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26912       break;
26913     case Intrinsic::x86_mmx_psrli_w:
26914       NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26915       break;
26916     case Intrinsic::x86_mmx_psrli_d:
26917       NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26918       break;
26919     case Intrinsic::x86_mmx_psrli_q:
26920       NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26921       break;
26922     case Intrinsic::x86_mmx_psrai_w:
26923       NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26924       break;
26925     case Intrinsic::x86_mmx_psrai_d:
26926       NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26927       break;
26928     }
26929 
    // The vector shift intrinsics with scalar shift amounts use a 32-bit
    // shift amount, but the SSE2/MMX shift instructions read 64 bits. Copy
    // the 32 bits to an MMX register.
26933     ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
26934     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26935                        DAG.getTargetConstant(NewIntrinsic, DL,
26936                                              getPointerTy(DAG.getDataLayout())),
26937                        Op.getOperand(1), ShAmt);
26938   }
26939   case Intrinsic::thread_pointer: {
26940     if (Subtarget.isTargetELF()) {
26941       SDLoc dl(Op);
26942       EVT PtrVT = getPointerTy(DAG.getDataLayout());
26943       // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
26944       Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(
26945           *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
26946       return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26947                          DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
26948     }
26949     report_fatal_error(
26950         "Target OS doesn't support __builtin_thread_pointer() yet.");
26951   }
26952   }
26953 }
26954 
26955 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26956                                  SDValue Src, SDValue Mask, SDValue Base,
26957                                  SDValue Index, SDValue ScaleOp, SDValue Chain,
26958                                  const X86Subtarget &Subtarget) {
26959   SDLoc dl(Op);
26960   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26961   // Scale must be constant.
26962   if (!C)
26963     return SDValue();
26964   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26965   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26966                                         TLI.getPointerTy(DAG.getDataLayout()));
26967   EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26968   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26969   // If source is undef or we know it won't be used, use a zero vector
26970   // to break register dependency.
26971   // TODO: use undef instead and let BreakFalseDeps deal with it?
26972   if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26973     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26974 
26975   // Cast mask to an integer type.
26976   Mask = DAG.getBitcast(MaskVT, Mask);
26977 
26978   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26979 
26980   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26981   SDValue Res =
26982       DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26983                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26984   return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26985 }
26986 
26987 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
26988                              SDValue Src, SDValue Mask, SDValue Base,
26989                              SDValue Index, SDValue ScaleOp, SDValue Chain,
26990                              const X86Subtarget &Subtarget) {
26991   MVT VT = Op.getSimpleValueType();
26992   SDLoc dl(Op);
26993   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26994   // Scale must be constant.
26995   if (!C)
26996     return SDValue();
26997   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26998   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26999                                         TLI.getPointerTy(DAG.getDataLayout()));
27000   unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27001                               VT.getVectorNumElements());
27002   MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27003 
27004   // We support two versions of the gather intrinsics. One with scalar mask and
27005   // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27006   if (Mask.getValueType() != MaskVT)
27007     Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27008 
27009   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27010   // If source is undef or we know it won't be used, use a zero vector
27011   // to break register dependency.
27012   // TODO: use undef instead and let BreakFalseDeps deal with it?
27013   if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27014     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27015 
27016   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27017 
27018   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27019   SDValue Res =
27020       DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27021                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27022   return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27023 }
27024 
27025 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27026                                SDValue Src, SDValue Mask, SDValue Base,
27027                                SDValue Index, SDValue ScaleOp, SDValue Chain,
27028                                const X86Subtarget &Subtarget) {
27029   SDLoc dl(Op);
27030   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27031   // Scale must be constant.
27032   if (!C)
27033     return SDValue();
27034   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27035   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27036                                         TLI.getPointerTy(DAG.getDataLayout()));
27037   unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27038                               Src.getSimpleValueType().getVectorNumElements());
27039   MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27040 
27041   // We support two versions of the scatter intrinsics. One with scalar mask and
27042   // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27043   if (Mask.getValueType() != MaskVT)
27044     Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27045 
27046   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27047 
27048   SDVTList VTs = DAG.getVTList(MVT::Other);
27049   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27050   SDValue Res =
27051       DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
27052                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27053   return Res;
27054 }
27055 
27056 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27057                                SDValue Mask, SDValue Base, SDValue Index,
27058                                SDValue ScaleOp, SDValue Chain,
27059                                const X86Subtarget &Subtarget) {
27060   SDLoc dl(Op);
27061   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27062   // Scale must be constant.
27063   if (!C)
27064     return SDValue();
27065   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27066   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27067                                         TLI.getPointerTy(DAG.getDataLayout()));
27068   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27069   SDValue Segment = DAG.getRegister(0, MVT::i32);
27070   MVT MaskVT =
27071     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27072   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
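  // The machine node takes the mask, then a standard X86 memory reference
  // (base, scale, index, disp, segment), and finally the chain.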
27073   SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27074   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27075   return SDValue(Res, 0);
27076 }
27077 
27078 /// Handles the lowering of builtin intrinsics with chain that return their
27079 /// value into registers EDX:EAX.
/// If operand SrcReg is a valid register identifier, then operand 2 of N is
/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
/// TargetOpcode.
/// Returns a Glue value which can be used to add an extra copy-from-reg if the
/// expanded intrinsic implicitly defines extra registers (i.e. not just
27085 /// EDX:EAX).
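/// For example, RDTSC/RDTSCP pass 0 (no SrcReg), while RDPMC and XGETBV pass
/// X86::ECX as SrcReg so that the counter/XCR selector index ends up in ECX.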
27086 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
27087                                         SelectionDAG &DAG,
27088                                         unsigned TargetOpcode,
27089                                         unsigned SrcReg,
27090                                         const X86Subtarget &Subtarget,
27091                                         SmallVectorImpl<SDValue> &Results) {
27092   SDValue Chain = N->getOperand(0);
27093   SDValue Glue;
27094 
27095   if (SrcReg) {
27096     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27097     Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27098     Glue = Chain.getValue(1);
27099   }
27100 
27101   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27102   SDValue N1Ops[] = {Chain, Glue};
27103   SDNode *N1 = DAG.getMachineNode(
27104       TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27105   Chain = SDValue(N1, 0);
27106 
  // The expanded instruction returns its result in registers EDX:EAX.
27108   SDValue LO, HI;
27109   if (Subtarget.is64Bit()) {
27110     LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27111     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27112                             LO.getValue(2));
27113   } else {
27114     LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27115     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27116                             LO.getValue(2));
27117   }
27118   Chain = HI.getValue(1);
27119   Glue = HI.getValue(2);
27120 
27121   if (Subtarget.is64Bit()) {
27122     // Merge the two 32-bit values into a 64-bit one.
27123     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27124                               DAG.getConstant(32, DL, MVT::i8));
27125     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27126     Results.push_back(Chain);
27127     return Glue;
27128   }
27129 
27130   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27131   SDValue Ops[] = { LO, HI };
27132   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27133   Results.push_back(Pair);
27134   Results.push_back(Chain);
27135   return Glue;
27136 }
27137 
27138 /// Handles the lowering of builtin intrinsics that read the time stamp counter
27139 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27140 /// READCYCLECOUNTER nodes.
27141 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27142                                     SelectionDAG &DAG,
27143                                     const X86Subtarget &Subtarget,
27144                                     SmallVectorImpl<SDValue> &Results) {
27145   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27146   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27147   // and the EAX register is loaded with the low-order 32 bits.
27148   SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27149                                              /* NoRegister */0, Subtarget,
27150                                              Results);
27151   if (Opcode != X86::RDTSCP)
27152     return;
27153 
27154   SDValue Chain = Results[1];
  // The RDTSCP instruction also loads the IA32_TSC_AUX MSR (address
  // C000_0103H) into the ECX register. Add ECX explicitly to the results.
27157   SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27158   Results[1] = ecx;
27159   Results.push_back(ecx.getValue(1));
27160 }
27161 
27162 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
27163                                      SelectionDAG &DAG) {
27164   SmallVector<SDValue, 3> Results;
27165   SDLoc DL(Op);
27166   getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27167                           Results);
27168   return DAG.getMergeValues(Results, DL);
27169 }
27170 
27171 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
27172   MachineFunction &MF = DAG.getMachineFunction();
27173   SDValue Chain = Op.getOperand(0);
27174   SDValue RegNode = Op.getOperand(2);
27175   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27176   if (!EHInfo)
27177     report_fatal_error("EH registrations only live in functions using WinEH");
27178 
27179   // Cast the operand to an alloca, and remember the frame index.
27180   auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27181   if (!FINode)
27182     report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27183   EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27184 
27185   // Return the chain operand without making any DAG nodes.
27186   return Chain;
27187 }
27188 
27189 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
27190   MachineFunction &MF = DAG.getMachineFunction();
27191   SDValue Chain = Op.getOperand(0);
27192   SDValue EHGuard = Op.getOperand(2);
27193   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27194   if (!EHInfo)
    report_fatal_error("EHGuard only lives in functions using WinEH");
27196 
27197   // Cast the operand to an alloca, and remember the frame index.
27198   auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27199   if (!FINode)
27200     report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27201   EHInfo->EHGuardFrameIndex = FINode->getIndex();
27202 
27203   // Return the chain operand without making any DAG nodes.
27204   return Chain;
27205 }
27206 
27207 /// Emit Truncating Store with signed or unsigned saturation.
27208 static SDValue
27209 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
27210                 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27211                 SelectionDAG &DAG) {
27212   SDVTList VTs = DAG.getVTList(MVT::Other);
27213   SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27214   SDValue Ops[] = { Chain, Val, Ptr, Undef };
27215   unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27216   return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
27217 }
27218 
27219 /// Emit Masked Truncating Store with signed or unsigned saturation.
27220 static SDValue
27221 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
27222                       SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27223                       MachineMemOperand *MMO, SelectionDAG &DAG) {
27224   SDVTList VTs = DAG.getVTList(MVT::Other);
27225   SDValue Ops[] = { Chain, Val, Ptr, Mask };
27226   unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27227   return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
27228 }
27229 
27230 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
27231                                       SelectionDAG &DAG) {
27232   unsigned IntNo = Op.getConstantOperandVal(1);
27233   const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27234   if (!IntrData) {
27235     switch (IntNo) {
27236     case llvm::Intrinsic::x86_seh_ehregnode:
27237       return MarkEHRegistrationNode(Op, DAG);
27238     case llvm::Intrinsic::x86_seh_ehguard:
27239       return MarkEHGuard(Op, DAG);
27240     case llvm::Intrinsic::x86_rdpkru: {
27241       SDLoc dl(Op);
27242       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27243       // Create a RDPKRU node and pass 0 to the ECX parameter.
27244       return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27245                          DAG.getConstant(0, dl, MVT::i32));
27246     }
27247     case llvm::Intrinsic::x86_wrpkru: {
27248       SDLoc dl(Op);
      // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27250       // to the EDX and ECX parameters.
27251       return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27252                          Op.getOperand(0), Op.getOperand(2),
27253                          DAG.getConstant(0, dl, MVT::i32),
27254                          DAG.getConstant(0, dl, MVT::i32));
27255     }
27256     case llvm::Intrinsic::asan_check_memaccess: {
27257       // Mark this as adjustsStack because it will be lowered to a call.
27258       DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
27259       // Don't do anything here, we will expand these intrinsics out later.
27260       return Op;
27261     }
27262     case llvm::Intrinsic::x86_flags_read_u32:
27263     case llvm::Intrinsic::x86_flags_read_u64:
27264     case llvm::Intrinsic::x86_flags_write_u32:
27265     case llvm::Intrinsic::x86_flags_write_u64: {
27266       // We need a frame pointer because this will get lowered to a PUSH/POP
27267       // sequence.
27268       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27269       MFI.setHasCopyImplyingStackAdjustment(true);
27270       // Don't do anything here, we will expand these intrinsics out later
27271       // during FinalizeISel in EmitInstrWithCustomInserter.
27272       return Op;
27273     }
27274     case Intrinsic::x86_lwpins32:
27275     case Intrinsic::x86_lwpins64:
27276     case Intrinsic::x86_umwait:
27277     case Intrinsic::x86_tpause: {
27278       SDLoc dl(Op);
27279       SDValue Chain = Op->getOperand(0);
27280       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27281       unsigned Opcode;
27282 
27283       switch (IntNo) {
27284       default: llvm_unreachable("Impossible intrinsic");
27285       case Intrinsic::x86_umwait:
27286         Opcode = X86ISD::UMWAIT;
27287         break;
27288       case Intrinsic::x86_tpause:
27289         Opcode = X86ISD::TPAUSE;
27290         break;
27291       case Intrinsic::x86_lwpins32:
27292       case Intrinsic::x86_lwpins64:
27293         Opcode = X86ISD::LWPINS;
27294         break;
27295       }
27296 
27297       SDValue Operation =
27298           DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27299                       Op->getOperand(3), Op->getOperand(4));
27300       SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27301       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27302                          Operation.getValue(1));
27303     }
27304     case Intrinsic::x86_enqcmd:
27305     case Intrinsic::x86_enqcmds: {
27306       SDLoc dl(Op);
27307       SDValue Chain = Op.getOperand(0);
27308       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27309       unsigned Opcode;
27310       switch (IntNo) {
27311       default: llvm_unreachable("Impossible intrinsic!");
27312       case Intrinsic::x86_enqcmd:
27313         Opcode = X86ISD::ENQCMD;
27314         break;
27315       case Intrinsic::x86_enqcmds:
27316         Opcode = X86ISD::ENQCMDS;
27317         break;
27318       }
27319       SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27320                                       Op.getOperand(3));
27321       SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27322       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27323                          Operation.getValue(1));
27324     }
27325     case Intrinsic::x86_aesenc128kl:
27326     case Intrinsic::x86_aesdec128kl:
27327     case Intrinsic::x86_aesenc256kl:
27328     case Intrinsic::x86_aesdec256kl: {
27329       SDLoc DL(Op);
27330       SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27331       SDValue Chain = Op.getOperand(0);
27332       unsigned Opcode;
27333 
27334       switch (IntNo) {
27335       default: llvm_unreachable("Impossible intrinsic");
27336       case Intrinsic::x86_aesenc128kl:
27337         Opcode = X86ISD::AESENC128KL;
27338         break;
27339       case Intrinsic::x86_aesdec128kl:
27340         Opcode = X86ISD::AESDEC128KL;
27341         break;
27342       case Intrinsic::x86_aesenc256kl:
27343         Opcode = X86ISD::AESENC256KL;
27344         break;
27345       case Intrinsic::x86_aesdec256kl:
27346         Opcode = X86ISD::AESDEC256KL;
27347         break;
27348       }
27349 
27350       MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27351       MachineMemOperand *MMO = MemIntr->getMemOperand();
27352       EVT MemVT = MemIntr->getMemoryVT();
27353       SDValue Operation = DAG.getMemIntrinsicNode(
27354           Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27355           MMO);
27356       SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27357 
27358       return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27359                          {ZF, Operation.getValue(0), Operation.getValue(2)});
27360     }
27361     case Intrinsic::x86_aesencwide128kl:
27362     case Intrinsic::x86_aesdecwide128kl:
27363     case Intrinsic::x86_aesencwide256kl:
27364     case Intrinsic::x86_aesdecwide256kl: {
27365       SDLoc DL(Op);
27366       SDVTList VTs = DAG.getVTList(
27367           {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27368            MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27369       SDValue Chain = Op.getOperand(0);
27370       unsigned Opcode;
27371 
27372       switch (IntNo) {
27373       default: llvm_unreachable("Impossible intrinsic");
27374       case Intrinsic::x86_aesencwide128kl:
27375         Opcode = X86ISD::AESENCWIDE128KL;
27376         break;
27377       case Intrinsic::x86_aesdecwide128kl:
27378         Opcode = X86ISD::AESDECWIDE128KL;
27379         break;
27380       case Intrinsic::x86_aesencwide256kl:
27381         Opcode = X86ISD::AESENCWIDE256KL;
27382         break;
27383       case Intrinsic::x86_aesdecwide256kl:
27384         Opcode = X86ISD::AESDECWIDE256KL;
27385         break;
27386       }
27387 
27388       MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27389       MachineMemOperand *MMO = MemIntr->getMemOperand();
27390       EVT MemVT = MemIntr->getMemoryVT();
27391       SDValue Operation = DAG.getMemIntrinsicNode(
27392           Opcode, DL, VTs,
27393           {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27394            Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27395            Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27396           MemVT, MMO);
27397       SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27398 
27399       return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27400                          {ZF, Operation.getValue(1), Operation.getValue(2),
27401                           Operation.getValue(3), Operation.getValue(4),
27402                           Operation.getValue(5), Operation.getValue(6),
27403                           Operation.getValue(7), Operation.getValue(8),
27404                           Operation.getValue(9)});
27405     }
27406     case Intrinsic::x86_testui: {
27407       SDLoc dl(Op);
27408       SDValue Chain = Op.getOperand(0);
27409       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27410       SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27411       SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27412       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27413                          Operation.getValue(1));
27414     }
27415     }
27416     return SDValue();
27417   }
27418 
27419   SDLoc dl(Op);
27420   switch(IntrData->Type) {
27421   default: llvm_unreachable("Unknown Intrinsic Type");
27422   case RDSEED:
27423   case RDRAND: {
27424     // Emit the node with the right value type.
27425     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27426     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27427 
27428     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0, cast to i32.
27430     SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27431                      DAG.getConstant(1, dl, Op->getValueType(1)),
27432                      DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
27433                      SDValue(Result.getNode(), 1)};
27434     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27435 
27436     // Return { result, isValid, chain }.
27437     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27438                        SDValue(Result.getNode(), 2));
27439   }
27440   case GATHER_AVX2: {
27441     SDValue Chain = Op.getOperand(0);
27442     SDValue Src   = Op.getOperand(2);
27443     SDValue Base  = Op.getOperand(3);
27444     SDValue Index = Op.getOperand(4);
27445     SDValue Mask  = Op.getOperand(5);
27446     SDValue Scale = Op.getOperand(6);
27447     return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27448                              Scale, Chain, Subtarget);
27449   }
27450   case GATHER: {
    // gather(v1, base, index, mask, scale);
27452     SDValue Chain = Op.getOperand(0);
27453     SDValue Src   = Op.getOperand(2);
27454     SDValue Base  = Op.getOperand(3);
27455     SDValue Index = Op.getOperand(4);
27456     SDValue Mask  = Op.getOperand(5);
27457     SDValue Scale = Op.getOperand(6);
27458     return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
27459                          Chain, Subtarget);
27460   }
27461   case SCATTER: {
    // scatter(base, mask, index, v1, scale);
27463     SDValue Chain = Op.getOperand(0);
27464     SDValue Base  = Op.getOperand(2);
27465     SDValue Mask  = Op.getOperand(3);
27466     SDValue Index = Op.getOperand(4);
27467     SDValue Src   = Op.getOperand(5);
27468     SDValue Scale = Op.getOperand(6);
27469     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27470                           Scale, Chain, Subtarget);
27471   }
27472   case PREFETCH: {
27473     const APInt &HintVal = Op.getConstantOperandAPInt(6);
27474     assert((HintVal == 2 || HintVal == 3) &&
27475            "Wrong prefetch hint in intrinsic: should be 2 or 3");
27476     unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27477     SDValue Chain = Op.getOperand(0);
27478     SDValue Mask  = Op.getOperand(2);
27479     SDValue Index = Op.getOperand(3);
27480     SDValue Base  = Op.getOperand(4);
27481     SDValue Scale = Op.getOperand(5);
27482     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
27483                            Subtarget);
27484   }
27485   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
27486   case RDTSC: {
27487     SmallVector<SDValue, 2> Results;
27488     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27489                             Results);
27490     return DAG.getMergeValues(Results, dl);
27491   }
27492   // Read Performance Monitoring Counters.
27493   case RDPMC:
27494   // GetExtended Control Register.
27495   case XGETBV: {
27496     SmallVector<SDValue, 2> Results;
27497 
27498     // RDPMC uses ECX to select the index of the performance counter to read.
27499     // XGETBV uses ECX to select the index of the XCR register to return.
27500     // The result is stored into registers EDX:EAX.
27501     expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27502                                 Subtarget, Results);
27503     return DAG.getMergeValues(Results, dl);
27504   }
27505   // XTEST intrinsics.
27506   case XTEST: {
27507     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27508     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27509 
27510     SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
27511     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27512     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27513                        Ret, SDValue(InTrans.getNode(), 1));
27514   }
27515   case TRUNCATE_TO_MEM_VI8:
27516   case TRUNCATE_TO_MEM_VI16:
27517   case TRUNCATE_TO_MEM_VI32: {
27518     SDValue Mask = Op.getOperand(4);
27519     SDValue DataToTruncate = Op.getOperand(3);
27520     SDValue Addr = Op.getOperand(2);
27521     SDValue Chain = Op.getOperand(0);
27522 
27523     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
27524     assert(MemIntr && "Expected MemIntrinsicSDNode!");
27525 
27526     EVT MemVT  = MemIntr->getMemoryVT();
27527 
27528     uint16_t TruncationOp = IntrData->Opc0;
27529     switch (TruncationOp) {
27530     case X86ISD::VTRUNC: {
27531       if (isAllOnesConstant(Mask)) // return just a truncate store
27532         return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
27533                                  MemIntr->getMemOperand());
27534 
27535       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27536       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27537       SDValue Offset = DAG.getUNDEF(VMask.getValueType());
27538 
27539       return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
27540                                 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27541                                 true /* truncating */);
27542     }
27543     case X86ISD::VTRUNCUS:
27544     case X86ISD::VTRUNCS: {
27545       bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
27546       if (isAllOnesConstant(Mask))
27547         return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
27548                                MemIntr->getMemOperand(), DAG);
27549 
27550       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27551       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27552 
27553       return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
27554                                    VMask, MemVT, MemIntr->getMemOperand(), DAG);
27555     }
27556     default:
27557       llvm_unreachable("Unsupported truncstore intrinsic");
27558     }
27559   }
27560   }
27561 }
27562 
27563 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
27564                                            SelectionDAG &DAG) const {
27565   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27566   MFI.setReturnAddressIsTaken(true);
27567 
27568   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
27569     return SDValue();
27570 
27571   unsigned Depth = Op.getConstantOperandVal(0);
27572   SDLoc dl(Op);
27573   EVT PtrVT = getPointerTy(DAG.getDataLayout());
27574 
27575   if (Depth > 0) {
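    // The return address of the requested frame sits one slot above that
    // frame's saved frame pointer, so load from FRAMEADDR(Depth) + SlotSize.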
27576     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
27577     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27578     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
27579     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27580                        DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
27581                        MachinePointerInfo());
27582   }
27583 
27584   // Just load the return address.
27585   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
27586   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
27587                      MachinePointerInfo());
27588 }
27589 
27590 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
27591                                                  SelectionDAG &DAG) const {
27592   DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
27593   return getReturnAddressFrameIndex(DAG);
27594 }
27595 
27596 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
27597   MachineFunction &MF = DAG.getMachineFunction();
27598   MachineFrameInfo &MFI = MF.getFrameInfo();
27599   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27600   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27601   EVT VT = Op.getValueType();
27602 
27603   MFI.setFrameAddressIsTaken(true);
27604 
27605   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
    // is not possible to crawl up the stack without looking at the unwind
    // codes simultaneously.
27609     int FrameAddrIndex = FuncInfo->getFAIndex();
27610     if (!FrameAddrIndex) {
27611       // Set up a frame object for the return address.
27612       unsigned SlotSize = RegInfo->getSlotSize();
27613       FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
27614           SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
27615       FuncInfo->setFAIndex(FrameAddrIndex);
27616     }
27617     return DAG.getFrameIndex(FrameAddrIndex, VT);
27618   }
27619 
27620   unsigned FrameReg =
27621       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27622   SDLoc dl(Op);  // FIXME probably not meaningful
27623   unsigned Depth = Op.getConstantOperandVal(0);
27624   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
27625           (FrameReg == X86::EBP && VT == MVT::i32)) &&
27626          "Invalid Frame Register!");
27627   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
27628   while (Depth--)
27629     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
27630                             MachinePointerInfo());
27631   return FrameAddr;
27632 }
27633 
27634 // FIXME? Maybe this could be a TableGen attribute on some registers and
27635 // this table could be generated automatically from RegInfo.
27636 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
27637                                               const MachineFunction &MF) const {
27638   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27639 
27640   Register Reg = StringSwitch<unsigned>(RegName)
27641                        .Case("esp", X86::ESP)
27642                        .Case("rsp", X86::RSP)
27643                        .Case("ebp", X86::EBP)
27644                        .Case("rbp", X86::RBP)
27645                        .Default(0);
27646 
27647   if (Reg == X86::EBP || Reg == X86::RBP) {
27648     if (!TFI.hasFP(MF))
27649       report_fatal_error("register " + StringRef(RegName) +
27650                          " is allocatable: function has no frame pointer");
27651 #ifndef NDEBUG
27652     else {
27653       const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27654       Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27655       assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
27656              "Invalid Frame Register!");
27657     }
27658 #endif
27659   }
27660 
27661   if (Reg)
27662     return Reg;
27663 
27664   report_fatal_error("Invalid register name global variable");
27665 }
27666 
27667 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
27668                                                      SelectionDAG &DAG) const {
27669   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27670   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27671 }
27672 
27673 Register X86TargetLowering::getExceptionPointerRegister(
27674     const Constant *PersonalityFn) const {
27675   if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
27676     return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27677 
27678   return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
27679 }
27680 
27681 Register X86TargetLowering::getExceptionSelectorRegister(
27682     const Constant *PersonalityFn) const {
27683   // Funclet personalities don't use selectors (the runtime does the selection).
27684   if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
27685     return X86::NoRegister;
27686   return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27687 }
27688 
27689 bool X86TargetLowering::needsFixedCatchObjects() const {
27690   return Subtarget.isTargetWin64();
27691 }
27692 
27693 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
27694   SDValue Chain     = Op.getOperand(0);
27695   SDValue Offset    = Op.getOperand(1);
27696   SDValue Handler   = Op.getOperand(2);
27697   SDLoc dl      (Op);
27698 
27699   EVT PtrVT = getPointerTy(DAG.getDataLayout());
27700   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27701   Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
27702   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
27703           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
27704          "Invalid Frame Register!");
27705   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
27706   Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
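  // The handler is stored over the return-address slot of the adjusted frame
  // (FrameReg + SlotSize + Offset), and EH_RETURN receives that slot's
  // address so the epilogue can return into the handler.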
27707 
27708   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
27709                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
27710                                                        dl));
27711   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
27712   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
27713   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
27714 
27715   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
27716                      DAG.getRegister(StoreAddrReg, PtrVT));
27717 }
27718 
27719 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
27720                                                SelectionDAG &DAG) const {
27721   SDLoc DL(Op);
  // If the subtarget is not 64-bit, we may need the global base register
  // after pseudo-instruction expansion in isel, i.e. after the CGBR pass has
  // run. Therefore, ask for the GlobalBaseReg now so that the pass inserts
  // the code for us in case we need it. Otherwise we would end up referencing
  // a virtual register that is never defined!
27728   if (!Subtarget.is64Bit()) {
27729     const X86InstrInfo *TII = Subtarget.getInstrInfo();
27730     (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
27731   }
27732   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
27733                      DAG.getVTList(MVT::i32, MVT::Other),
27734                      Op.getOperand(0), Op.getOperand(1));
27735 }
27736 
27737 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
27738                                                 SelectionDAG &DAG) const {
27739   SDLoc DL(Op);
27740   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
27741                      Op.getOperand(0), Op.getOperand(1));
27742 }
27743 
27744 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
27745                                                        SelectionDAG &DAG) const {
27746   SDLoc DL(Op);
27747   return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
27748                      Op.getOperand(0));
27749 }
27750 
27751 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
27752   return Op.getOperand(0);
27753 }
27754 
27755 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
27756                                                 SelectionDAG &DAG) const {
27757   SDValue Root = Op.getOperand(0);
27758   SDValue Trmp = Op.getOperand(1); // trampoline
27759   SDValue FPtr = Op.getOperand(2); // nested function
27760   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
27761   SDLoc dl (Op);
27762 
27763   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27764   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27765 
27766   if (Subtarget.is64Bit()) {
27767     SDValue OutChains[6];
27768 
27769     // Large code-model.
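    // The stores below assemble the following 23-byte trampoline (offsets on
    // the left; <FPtr> and <Nest> are 8-byte immediates):
    //    0: 49 BB <FPtr>   movabsq $FPtr, %r11
    //   10: 49 BA <Nest>   movabsq $Nest, %r10
    //   20: 49 FF E3       jmpq   *%r11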
27770     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
27771     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
27772 
27773     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
27774     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
27775 
27776     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
27777 
27778     // Load the pointer to the nested function into R11.
27779     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
27780     SDValue Addr = Trmp;
27781     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27782                                 Addr, MachinePointerInfo(TrmpAddr));
27783 
27784     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27785                        DAG.getConstant(2, dl, MVT::i64));
27786     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27787                                 MachinePointerInfo(TrmpAddr, 2), Align(2));
27788 
27789     // Load the 'nest' parameter value into R10.
27790     // R10 is specified in X86CallingConv.td
27791     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27792     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27793                        DAG.getConstant(10, dl, MVT::i64));
27794     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27795                                 Addr, MachinePointerInfo(TrmpAddr, 10));
27796 
27797     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27798                        DAG.getConstant(12, dl, MVT::i64));
27799     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27800                                 MachinePointerInfo(TrmpAddr, 12), Align(2));
27801 
27802     // Jump to the nested function.
27803     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27804     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27805                        DAG.getConstant(20, dl, MVT::i64));
27806     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27807                                 Addr, MachinePointerInfo(TrmpAddr, 20));
27808 
27809     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27810     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27811                        DAG.getConstant(22, dl, MVT::i64));
27812     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27813                                 Addr, MachinePointerInfo(TrmpAddr, 22));
27814 
27815     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27816   } else {
27817     const Function *Func =
27818       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
27819     CallingConv::ID CC = Func->getCallingConv();
27820     unsigned NestReg;
27821 
27822     switch (CC) {
27823     default:
27824       llvm_unreachable("Unsupported calling convention");
27825     case CallingConv::C:
27826     case CallingConv::X86_StdCall: {
27827       // Pass 'nest' parameter in ECX.
27828       // Must be kept in sync with X86CallingConv.td
27829       NestReg = X86::ECX;
27830 
27831       // Check that ECX wasn't needed by an 'inreg' parameter.
27832       FunctionType *FTy = Func->getFunctionType();
27833       const AttributeList &Attrs = Func->getAttributes();
27834 
27835       if (!Attrs.isEmpty() && !Func->isVarArg()) {
27836         unsigned InRegCount = 0;
27837         unsigned Idx = 0;
27838 
27839         for (FunctionType::param_iterator I = FTy->param_begin(),
27840              E = FTy->param_end(); I != E; ++I, ++Idx)
27841           if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
27842             const DataLayout &DL = DAG.getDataLayout();
27843             // FIXME: should only count parameters that are lowered to integers.
27844             InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
27845           }
27846 
27847         if (InRegCount > 2) {
27848           report_fatal_error("Nest register in use - reduce number of inreg"
27849                              " parameters!");
27850         }
27851       }
27852       break;
27853     }
27854     case CallingConv::X86_FastCall:
27855     case CallingConv::X86_ThisCall:
27856     case CallingConv::Fast:
27857     case CallingConv::Tail:
27858     case CallingConv::SwiftTail:
27859       // Pass 'nest' parameter in EAX.
27860       // Must be kept in sync with X86CallingConv.td
27861       NestReg = X86::EAX;
27862       break;
27863     }
27864 
27865     SDValue OutChains[4];
27866     SDValue Addr, Disp;
27867 
27868     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27869                        DAG.getConstant(10, dl, MVT::i32));
27870     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
27871 
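    // The stores below assemble the following 10-byte trampoline
    // (<Nest> is a 4-byte immediate, <Disp> = FPtr - (Trmp + 10)):
    //    0: B8+r <Nest>    movl $Nest, %NestReg
    //    5: E9   <Disp>    jmp  FPtr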
27872     // This is storing the opcode for MOV32ri.
27873     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
27874     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
27875     OutChains[0] =
27876         DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
27877                      Trmp, MachinePointerInfo(TrmpAddr));
27878 
27879     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27880                        DAG.getConstant(1, dl, MVT::i32));
27881     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
27882                                 MachinePointerInfo(TrmpAddr, 1), Align(1));
27883 
27884     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
27885     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27886                        DAG.getConstant(5, dl, MVT::i32));
27887     OutChains[2] =
27888         DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
27889                      MachinePointerInfo(TrmpAddr, 5), Align(1));
27890 
27891     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27892                        DAG.getConstant(6, dl, MVT::i32));
27893     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
27894                                 MachinePointerInfo(TrmpAddr, 6), Align(1));
27895 
27896     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27897   }
27898 }
27899 
27900 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
27901                                             SelectionDAG &DAG) const {
27902   /*
27903    The rounding mode is in bits 11:10 of FPSR, and has the following
27904    settings:
27905      00 Round to nearest
27906      01 Round to -inf
27907      10 Round to +inf
27908      11 Round to 0
27909 
27910   FLT_ROUNDS, on the other hand, expects the following:
27911     -1 Undefined
27912      0 Round to 0
27913      1 Round to nearest
27914      2 Round to +inf
27915      3 Round to -inf
27916 
27917   To perform the conversion, we use a packed lookup table of the four 2-bit
  values that we can index by FPSR[11:10]
27919     0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
27920 
27921     (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
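
  For example, FPSR[11:10] = 01 (round toward -inf) gives
    (0x2d >> ((0x400 & 0xc00) >> 9)) & 3 = (0x2d >> 2) & 3 = 3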
27922   */
27923 
27924   MachineFunction &MF = DAG.getMachineFunction();
27925   MVT VT = Op.getSimpleValueType();
27926   SDLoc DL(Op);
27927 
27928   // Save FP Control Word to stack slot
27929   int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
27930   SDValue StackSlot =
27931       DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
27932 
27933   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
27934 
27935   SDValue Chain = Op.getOperand(0);
27936   SDValue Ops[] = {Chain, StackSlot};
27937   Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
27938                                   DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
27939                                   Align(2), MachineMemOperand::MOStore);
27940 
27941   // Load FP Control Word from stack slot
27942   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
27943   Chain = CWD.getValue(1);
27944 
27945   // Mask and turn the control bits into a shift for the lookup table.
27946   SDValue Shift =
27947     DAG.getNode(ISD::SRL, DL, MVT::i16,
27948                 DAG.getNode(ISD::AND, DL, MVT::i16,
27949                             CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
27950                 DAG.getConstant(9, DL, MVT::i8));
27951   Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
27952 
27953   SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27954   SDValue RetVal =
27955     DAG.getNode(ISD::AND, DL, MVT::i32,
27956                 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27957                 DAG.getConstant(3, DL, MVT::i32));
27958 
27959   RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27960 
27961   return DAG.getMergeValues({RetVal, Chain}, DL);
27962 }
27963 
27964 SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27965                                              SelectionDAG &DAG) const {
27966   MachineFunction &MF = DAG.getMachineFunction();
27967   SDLoc DL(Op);
27968   SDValue Chain = Op.getNode()->getOperand(0);
27969 
  // The x87 FP control word can only be loaded from memory, so we need to
  // allocate stack space to save/load the FP control word.
27972   int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27973   SDValue StackSlot =
27974       DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27975   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27976   MachineMemOperand *MMO =
27977       MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27978 
27979   // Store FP control word into memory.
27980   SDValue Ops[] = {Chain, StackSlot};
27981   Chain = DAG.getMemIntrinsicNode(
27982       X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27983 
27984   // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27985   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27986   Chain = CWD.getValue(1);
27987   CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27988                     DAG.getConstant(0xf3ff, DL, MVT::i16));
27989 
27990   // Calculate new rounding mode.
27991   SDValue NewRM = Op.getNode()->getOperand(1);
27992   SDValue RMBits;
27993   if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27994     uint64_t RM = CVal->getZExtValue();
27995     int FieldVal;
27996     switch (static_cast<RoundingMode>(RM)) {
27997     case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27998     case RoundingMode::TowardNegative:    FieldVal = X86::rmDownward; break;
27999     case RoundingMode::TowardPositive:    FieldVal = X86::rmUpward; break;
28000     case RoundingMode::TowardZero:        FieldVal = X86::rmTowardZero; break;
28001     default:
28002       llvm_unreachable("rounding mode is not supported by X86 hardware");
28003     }
28004     RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28005   } else {
28006     // Need to convert argument into bits of control word:
28007     //    0 Round to 0       -> 11
28008     //    1 Round to nearest -> 00
28009     //    2 Round to +inf    -> 10
28010     //    3 Round to -inf    -> 01
    // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
28012     // To make the conversion, put all these values into a value 0xc9 and shift
28013     // it left depending on the rounding mode:
28014     //    (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28015     //    (0xc9 << 6) & 0xc00 = X86::rmToNearest
28016     //    ...
28017     // (0xc9 << (2 * NewRM + 4)) & 0xc00
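    // Illustrative check (comment only): with 0xc9 = 0b11001001,
    //   NewRM=0 (to zero):    (0xc9 << 4)  & 0xc00 = 0xc00 = rmTowardZero
    //   NewRM=1 (to nearest): (0xc9 << 6)  & 0xc00 = 0x000 = rmToNearest
    //   NewRM=2 (to +inf):    (0xc9 << 8)  & 0xc00 = 0x800 = rmUpward
    //   NewRM=3 (to -inf):    (0xc9 << 10) & 0xc00 = 0x400 = rmDownward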
28018     SDValue ShiftValue =
28019         DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28020                     DAG.getNode(ISD::ADD, DL, MVT::i32,
28021                                 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28022                                             DAG.getConstant(1, DL, MVT::i8)),
28023                                 DAG.getConstant(4, DL, MVT::i32)));
28024     SDValue Shifted =
28025         DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28026                     ShiftValue);
28027     RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28028                          DAG.getConstant(0xc00, DL, MVT::i16));
28029   }
28030 
28031   // Update rounding mode bits and store the new FP Control Word into stack.
28032   CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28033   Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 2);
28034 
28035   // Load FP control word from the slot.
28036   SDValue OpsLD[] = {Chain, StackSlot};
28037   MachineMemOperand *MMOL =
28038       MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28039   Chain = DAG.getMemIntrinsicNode(
28040       X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28041 
28042   // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28043   // same way but in bits 14:13.
28044   if (Subtarget.hasSSE1()) {
28045     // Store MXCSR into memory.
28046     Chain = DAG.getNode(
28047         ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28048         DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28049         StackSlot);
28050 
28051     // Load MXCSR from stack slot and clear RM field (bits 14:13).
28052     SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28053     Chain = CWD.getValue(1);
28054     CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28055                       DAG.getConstant(0xffff9fff, DL, MVT::i32));
28056 
28057     // Shift X87 RM bits from 11:10 to 14:13.
28058     RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28059     RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28060                          DAG.getConstant(3, DL, MVT::i8));
28061 
28062     // Update rounding mode bits and store the new FP Control Word into stack.
28063     CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28064     Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 4);
28065 
28066     // Load MXCSR from the slot.
28067     Chain = DAG.getNode(
28068         ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28069         DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28070         StackSlot);
28071   }
28072 
28073   return Chain;
28074 }
28075 
/// Lower a vector CTLZ using a natively supported vector CTLZ instruction.
//
// i8/i16 vectors are implemented using the dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
28082 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28083                                          const X86Subtarget &Subtarget) {
28084   assert(Op.getOpcode() == ISD::CTLZ);
28085   SDLoc dl(Op);
28086   MVT VT = Op.getSimpleValueType();
28087   MVT EltVT = VT.getVectorElementType();
28088   unsigned NumElems = VT.getVectorNumElements();
28089 
28090   assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28091           "Unsupported element type");
28092 
  // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28094   if (NumElems > 16 ||
28095       (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28096     return splitVectorIntUnary(Op, DAG);
28097 
28098   MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28099   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28100           "Unsupported value type for operation");
28101 
  // Use the natively supported vector instruction vplzcntd.
28103   Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28104   SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28105   SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28106   SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28107 
28108   return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28109 }
28110 
28111 // Lower CTLZ using a PSHUFB lookup table implementation.
28112 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28113                                        const X86Subtarget &Subtarget,
28114                                        SelectionDAG &DAG) {
28115   MVT VT = Op.getSimpleValueType();
28116   int NumElts = VT.getVectorNumElements();
28117   int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28118   MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28119 
28120   // Per-nibble leading zero PSHUFB lookup table.
28121   const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28122                        /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28123                        /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28124                        /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
28125 
28126   SmallVector<SDValue, 64> LUTVec;
28127   for (int i = 0; i < NumBytes; ++i)
28128     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
28129   SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
28130 
  // Begin by bitcasting the input to a byte vector, then split those bytes
  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
  // If the hi input nibble is zero then we add both results together, otherwise
  // we just take the hi result (by masking the lo result to zero before the
  // add).
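  // Illustrative example (comment only): for the byte 0x1c the hi nibble is
  // 0x1, so LUT[1] = 3 leading zeros and the lo result is masked away; for
  // 0x05 the hi nibble is zero, so LUT[0] + LUT[5] = 4 + 1 = 5 leading zeros.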
28136   SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
28137   SDValue Zero = DAG.getConstant(0, DL, CurrVT);
28138 
28139   SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
28140   SDValue Lo = Op0;
28141   SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
28142   SDValue HiZ;
28143   if (CurrVT.is512BitVector()) {
28144     MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28145     HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
28146     HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28147   } else {
28148     HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
28149   }
28150 
28151   Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
28152   Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
28153   Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
28154   SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
28155 
  // Merge the result from vXi8 back to VT, working on the lo/hi halves
  // of the current vector width in the same way we did for the nibbles.
  // If the upper half of the input element is zero then add the halves'
  // leading zero counts together, otherwise just use the upper half's.
  // Double the width of the result until we reach the target width.
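  // Illustrative example (comment only): for the i16 element 0x001c the upper
  // byte is zero, so the byte counts add: 8 + 3 = 11 leading zeros; for 0x1c00
  // the upper byte is non-zero, so only its count (3) is used.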
28161   while (CurrVT != VT) {
28162     int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
28163     int CurrNumElts = CurrVT.getVectorNumElements();
28164     MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
28165     MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
28166     SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
28167 
28168     // Check if the upper half of the input element is zero.
28169     if (CurrVT.is512BitVector()) {
28170       MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28171       HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
28172                          DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28173       HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28174     } else {
28175       HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
28176                          DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28177     }
28178     HiZ = DAG.getBitcast(NextVT, HiZ);
28179 
28180     // Move the upper/lower halves to the lower bits as we'll be extending to
28181     // NextVT. Mask the lower result to zero if HiZ is true and add the results
28182     // together.
28183     SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
28184     SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
28185     SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
28186     R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
28187     Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
28188     CurrVT = NextVT;
28189   }
28190 
28191   return Res;
28192 }
28193 
28194 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
28195                                const X86Subtarget &Subtarget,
28196                                SelectionDAG &DAG) {
28197   MVT VT = Op.getSimpleValueType();
28198 
28199   if (Subtarget.hasCDI() &&
28200       // vXi8 vectors need to be promoted to 512-bits for vXi32.
28201       (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
28202     return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
28203 
28204   // Decompose 256-bit ops into smaller 128-bit ops.
28205   if (VT.is256BitVector() && !Subtarget.hasInt256())
28206     return splitVectorIntUnary(Op, DAG);
28207 
28208   // Decompose 512-bit ops into smaller 256-bit ops.
28209   if (VT.is512BitVector() && !Subtarget.hasBWI())
28210     return splitVectorIntUnary(Op, DAG);
28211 
28212   assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
28213   return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
28214 }
28215 
28216 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
28217                          SelectionDAG &DAG) {
28218   MVT VT = Op.getSimpleValueType();
28219   MVT OpVT = VT;
28220   unsigned NumBits = VT.getSizeInBits();
28221   SDLoc dl(Op);
28222   unsigned Opc = Op.getOpcode();
28223 
28224   if (VT.isVector())
28225     return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
28226 
28227   Op = Op.getOperand(0);
28228   if (VT == MVT::i8) {
    // Zero extend to i32 since there is no i8 bsr.
28230     OpVT = MVT::i32;
28231     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
28232   }
28233 
28234   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
28235   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
28236   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
28237 
28238   if (Opc == ISD::CTLZ) {
    // If src is zero (i.e. bsr sets ZF), select 2*NumBits-1 so that the xor
    // with NumBits-1 below yields NumBits.
28240     SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
28241                      DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28242                      Op.getValue(1)};
28243     Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
28244   }
28245 
28246   // Finally xor with NumBits-1.
28247   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
28248                    DAG.getConstant(NumBits - 1, dl, OpVT));
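  // For non-zero x this computes (NumBits-1) - bsr(x) == ctlz(x); the xor
  // equals the subtraction because every bit of NumBits-1 is set. E.g. for
  // i32 x = 0x00010000: bsr = 16 and 16 ^ 31 == 15 == ctlz(x).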
28249 
28250   if (VT == MVT::i8)
28251     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
28252   return Op;
28253 }
28254 
28255 static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
28256                          SelectionDAG &DAG) {
28257   MVT VT = Op.getSimpleValueType();
28258   unsigned NumBits = VT.getScalarSizeInBits();
28259   SDValue N0 = Op.getOperand(0);
28260   SDLoc dl(Op);
28261 
28262   assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
28263          "Only scalar CTTZ requires custom lowering");
28264 
28265   // Issue a bsf (scan bits forward) which also sets EFLAGS.
28266   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
28267   Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
28268 
28269   // If src is zero (i.e. bsf sets ZF), returns NumBits.
28270   SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
28271                    DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28272                    Op.getValue(1)};
28273   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
28274 }
28275 
28276 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
28277                            const X86Subtarget &Subtarget) {
28278   MVT VT = Op.getSimpleValueType();
28279   if (VT == MVT::i16 || VT == MVT::i32)
28280     return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
28281 
28282   if (VT == MVT::v32i16 || VT == MVT::v64i8)
28283     return splitVectorIntBinary(Op, DAG);
28284 
28285   assert(Op.getSimpleValueType().is256BitVector() &&
28286          Op.getSimpleValueType().isInteger() &&
28287          "Only handle AVX 256-bit vector integer operation");
28288   return splitVectorIntBinary(Op, DAG);
28289 }
28290 
28291 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
28292                                   const X86Subtarget &Subtarget) {
28293   MVT VT = Op.getSimpleValueType();
28294   SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
28295   unsigned Opcode = Op.getOpcode();
28296   SDLoc DL(Op);
28297 
28298   if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
28299       (VT.is256BitVector() && !Subtarget.hasInt256())) {
28300     assert(Op.getSimpleValueType().isInteger() &&
28301            "Only handle AVX vector integer operation");
28302     return splitVectorIntBinary(Op, DAG);
28303   }
28304 
28305   // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
28306   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28307   EVT SetCCResultType =
28308       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28309 
28310   unsigned BitWidth = VT.getScalarSizeInBits();
28311   if (Opcode == ISD::USUBSAT) {
28312     if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
28313       // Handle a special-case with a bit-hack instead of cmp+select:
28314       // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
28315       // If the target can use VPTERNLOG, DAGToDAG will match this as
28316       // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
28317       // "broadcast" constant load.
28318       ConstantSDNode *C = isConstOrConstSplat(Y, true);
28319       if (C && C->getAPIntValue().isSignMask()) {
28320         SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
28321         SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
28322         SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
28323         SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
28324         return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
28325       }
28326     }
28327     if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
28328       // usubsat X, Y --> (X >u Y) ? X - Y : 0
28329       SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
28330       SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
28331       // TODO: Move this to DAGCombiner?
28332       if (SetCCResultType == VT &&
28333           DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
28334         return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
28335       return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
28336     }
28337   }
28338 
28339   if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
28340       (!VT.isVector() || VT == MVT::v2i64)) {
28341     APInt MinVal = APInt::getSignedMinValue(BitWidth);
28342     APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
28343     SDValue Zero = DAG.getConstant(0, DL, VT);
28344     SDValue Result =
28345         DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
28346                     DAG.getVTList(VT, SetCCResultType), X, Y);
28347     SDValue SumDiff = Result.getValue(0);
28348     SDValue Overflow = Result.getValue(1);
28349     SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
28350     SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
28351     SDValue SumNeg =
28352         DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
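    // On overflow the wrapped SumDiff has the opposite sign of the exact
    // result: a negative SumDiff means the exact result overflowed upwards
    // (clamp to SatMax), otherwise it overflowed downwards (clamp to SatMin).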
28353     Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
28354     return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
28355   }
28356 
28357   // Use default expansion.
28358   return SDValue();
28359 }
28360 
28361 static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
28362                         SelectionDAG &DAG) {
28363   MVT VT = Op.getSimpleValueType();
28364   if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
28365     // Since X86 does not have CMOV for 8-bit integer, we don't convert
28366     // 8-bit integer abs to NEG and CMOV.
28367     SDLoc DL(Op);
28368     SDValue N0 = Op.getOperand(0);
28369     SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28370                               DAG.getConstant(0, DL, VT), N0);
28371     SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
28372                      SDValue(Neg.getNode(), 1)};
28373     return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
28374   }
28375 
28376   // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
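  // The blend selects on the sign bit of X: negative elements take 0-X and
  // the rest keep X, which yields the absolute value.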
28377   if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
28378     SDLoc DL(Op);
28379     SDValue Src = Op.getOperand(0);
28380     SDValue Sub =
28381         DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
28382     return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
28383   }
28384 
28385   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
28386     assert(VT.isInteger() &&
28387            "Only handle AVX 256-bit vector integer operation");
28388     return splitVectorIntUnary(Op, DAG);
28389   }
28390 
28391   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28392     return splitVectorIntUnary(Op, DAG);
28393 
28394   // Default to expand.
28395   return SDValue();
28396 }
28397 
28398 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
28399   MVT VT = Op.getSimpleValueType();
28400 
28401   // For AVX1 cases, split to use legal ops (everything but v4i64).
28402   if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
28403     return splitVectorIntBinary(Op, DAG);
28404 
28405   if (VT == MVT::v32i16 || VT == MVT::v64i8)
28406     return splitVectorIntBinary(Op, DAG);
28407 
28408   // Default to expand.
28409   return SDValue();
28410 }
28411 
28412 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
28413                         SelectionDAG &DAG) {
28414   SDLoc dl(Op);
28415   MVT VT = Op.getSimpleValueType();
28416 
28417   // Decompose 256-bit ops into 128-bit ops.
28418   if (VT.is256BitVector() && !Subtarget.hasInt256())
28419     return splitVectorIntBinary(Op, DAG);
28420 
28421   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28422     return splitVectorIntBinary(Op, DAG);
28423 
28424   SDValue A = Op.getOperand(0);
28425   SDValue B = Op.getOperand(1);
28426 
28427   // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
28428   // vector pairs, multiply and truncate.
28429   if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
28430     unsigned NumElts = VT.getVectorNumElements();
28431 
28432     if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28433         (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28434       MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
28435       return DAG.getNode(
28436           ISD::TRUNCATE, dl, VT,
28437           DAG.getNode(ISD::MUL, dl, ExVT,
28438                       DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
28439                       DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
28440     }
28441 
28442     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28443 
    // Extract the lo/hi parts and any-extend them to i16.
    // We only keep the low byte of each 16-bit pmullw result element, so it
    // doesn't matter what ends up in the high byte of each 16-bit element.
28448     SDValue Undef = DAG.getUNDEF(VT);
28449     SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
28450     SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
28451 
28452     SDValue BLo, BHi;
28453     if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28454       // If the RHS is a constant, manually unpackl/unpackh.
28455       SmallVector<SDValue, 16> LoOps, HiOps;
28456       for (unsigned i = 0; i != NumElts; i += 16) {
28457         for (unsigned j = 0; j != 8; ++j) {
28458           LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
28459                                                MVT::i16));
28460           HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
28461                                                MVT::i16));
28462         }
28463       }
28464 
28465       BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28466       BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28467     } else {
28468       BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
28469       BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
28470     }
28471 
    // Multiply, mask the lower 8 bits of the lo/hi results and pack.
28473     SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
28474     SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
28475     return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28476   }
28477 
28478   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
28479   if (VT == MVT::v4i32) {
28480     assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
28481            "Should not custom lower when pmulld is available!");
28482 
28483     // Extract the odd parts.
28484     static const int UnpackMask[] = { 1, -1, 3, -1 };
28485     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
28486     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
28487 
28488     // Multiply the even parts.
28489     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28490                                 DAG.getBitcast(MVT::v2i64, A),
28491                                 DAG.getBitcast(MVT::v2i64, B));
28492     // Now multiply odd parts.
28493     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28494                                DAG.getBitcast(MVT::v2i64, Aodds),
28495                                DAG.getBitcast(MVT::v2i64, Bodds));
28496 
28497     Evens = DAG.getBitcast(VT, Evens);
28498     Odds = DAG.getBitcast(VT, Odds);
28499 
28500     // Merge the two vectors back together with a shuffle. This expands into 2
28501     // shuffles.
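    // Viewed as v4i32, Evens holds the low halves of a*e and c*g in lanes 0
    // and 2, and Odds holds b*f and d*h there, so mask {0,4,2,6} interleaves
    // them into <ae|bf|cg|dh>.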
28502     static const int ShufMask[] = { 0, 4, 2, 6 };
28503     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
28504   }
28505 
28506   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
28507          "Only know how to lower V2I64/V4I64/V8I64 multiply");
28508   assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
28509 
28510   //  Ahi = psrlqi(a, 32);
28511   //  Bhi = psrlqi(b, 32);
28512   //
28513   //  AloBlo = pmuludq(a, b);
28514   //  AloBhi = pmuludq(a, Bhi);
28515   //  AhiBlo = pmuludq(Ahi, b);
28516   //
28517   //  Hi = psllqi(AloBhi + AhiBlo, 32);
28518   //  return AloBlo + Hi;
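  // This works because splitting each 64-bit operand into 32-bit halves gives
  //   a * b = (Alo + (Ahi << 32)) * (Blo + (Bhi << 32))
  //         = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32)      (mod 2^64)
  // and the Ahi*Bhi term is shifted entirely out of the low 64 bits.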
28519   KnownBits AKnown = DAG.computeKnownBits(A);
28520   KnownBits BKnown = DAG.computeKnownBits(B);
28521 
28522   APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
28523   bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
28524   bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
28525 
28526   APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
28527   bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
28528   bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
28529 
28530   SDValue Zero = DAG.getConstant(0, dl, VT);
28531 
28532   // Only multiply lo/hi halves that aren't known to be zero.
28533   SDValue AloBlo = Zero;
28534   if (!ALoIsZero && !BLoIsZero)
28535     AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
28536 
28537   SDValue AloBhi = Zero;
28538   if (!ALoIsZero && !BHiIsZero) {
28539     SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
28540     AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
28541   }
28542 
28543   SDValue AhiBlo = Zero;
28544   if (!AHiIsZero && !BLoIsZero) {
28545     SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
28546     AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
28547   }
28548 
28549   SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
28550   Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
28551 
28552   return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
28553 }
28554 
28555 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
28556                                      MVT VT, bool IsSigned,
28557                                      const X86Subtarget &Subtarget,
28558                                      SelectionDAG &DAG,
28559                                      SDValue *Low = nullptr) {
28560   unsigned NumElts = VT.getVectorNumElements();
28561 
28562   // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
28563   // to a vXi16 type. Do the multiplies, shift the results and pack the half
28564   // lane results back together.
28565 
28566   // We'll take different approaches for signed and unsigned.
  // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
  // words and use pmullw to calculate the full 16-bit product.
  // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
  // shift them left into the upper byte of each word. This allows us to use
  // pmulhw to calculate the full 16-bit product. This trick means we don't
  // need to sign extend the bytes to use pmullw.
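  // The signed trick relies on (a << 8) * (b << 8) == (a * b) << 16, so the
  // pmulhw of the byte-in-high-half words is exactly the 16-bit signed
  // product a * b.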
28573 
28574   MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28575   SDValue Zero = DAG.getConstant(0, dl, VT);
28576 
28577   SDValue ALo, AHi;
28578   if (IsSigned) {
28579     ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
28580     AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
28581   } else {
28582     ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
28583     AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
28584   }
28585 
28586   SDValue BLo, BHi;
28587   if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28588     // If the RHS is a constant, manually unpackl/unpackh and extend.
28589     SmallVector<SDValue, 16> LoOps, HiOps;
28590     for (unsigned i = 0; i != NumElts; i += 16) {
28591       for (unsigned j = 0; j != 8; ++j) {
28592         SDValue LoOp = B.getOperand(i + j);
28593         SDValue HiOp = B.getOperand(i + j + 8);
28594 
28595         if (IsSigned) {
28596           LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
28597           HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
28598           LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
28599                              DAG.getConstant(8, dl, MVT::i16));
28600           HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
28601                              DAG.getConstant(8, dl, MVT::i16));
28602         } else {
28603           LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
28604           HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
28605         }
28606 
28607         LoOps.push_back(LoOp);
28608         HiOps.push_back(HiOp);
28609       }
28610     }
28611 
28612     BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28613     BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28614   } else if (IsSigned) {
28615     BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
28616     BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
28617   } else {
28618     BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
28619     BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
28620   }
28621 
  // Multiply, then pack the upper 8 bits of each 16-bit lo/hi result back to
  // vXi8 (and optionally the lower 8 bits into *Low).
28624   unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
28625   SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
28626   SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
28627 
28628   if (Low)
28629     *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28630 
28631   return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
28632 }
28633 
28634 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
28635                          SelectionDAG &DAG) {
28636   SDLoc dl(Op);
28637   MVT VT = Op.getSimpleValueType();
28638   bool IsSigned = Op->getOpcode() == ISD::MULHS;
28639   unsigned NumElts = VT.getVectorNumElements();
28640   SDValue A = Op.getOperand(0);
28641   SDValue B = Op.getOperand(1);
28642 
28643   // Decompose 256-bit ops into 128-bit ops.
28644   if (VT.is256BitVector() && !Subtarget.hasInt256())
28645     return splitVectorIntBinary(Op, DAG);
28646 
28647   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28648     return splitVectorIntBinary(Op, DAG);
28649 
28650   if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
28651     assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
28652            (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
28653            (VT == MVT::v16i32 && Subtarget.hasAVX512()));
28654 
    // PMULxD operations multiply each even value (starting at 0) of LHS with
    // the related value of RHS and produce a widened result.
    // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
    // => <2 x i64> <ae|cg>
    //
    // In other words, to have all the results, we need to perform two PMULxD:
    // 1. one with the even values.
    // 2. one with the odd values.
    // To achieve #2, we need to place the odd values at an even position.
    //
    // Place the odd values at an even position (basically, shift all values 1
    // step to the left):
28667     const int Mask[] = {1, -1,  3, -1,  5, -1,  7, -1,
28668                         9, -1, 11, -1, 13, -1, 15, -1};
28669     // <a|b|c|d> => <b|undef|d|undef>
28670     SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
28671                                         makeArrayRef(&Mask[0], NumElts));
28672     // <e|f|g|h> => <f|undef|h|undef>
28673     SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
28674                                         makeArrayRef(&Mask[0], NumElts));
28675 
28676     // Emit two multiplies, one for the lower 2 ints and one for the higher 2
28677     // ints.
28678     MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
28679     unsigned Opcode =
28680         (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
28681     // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28682     // => <2 x i64> <ae|cg>
28683     SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28684                                                   DAG.getBitcast(MulVT, A),
28685                                                   DAG.getBitcast(MulVT, B)));
28686     // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
28687     // => <2 x i64> <bf|dh>
28688     SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28689                                                   DAG.getBitcast(MulVT, Odd0),
28690                                                   DAG.getBitcast(MulVT, Odd1)));
28691 
28692     // Shuffle it back into the right order.
28693     SmallVector<int, 16> ShufMask(NumElts);
28694     for (int i = 0; i != (int)NumElts; ++i)
28695       ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
28696 
28697     SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
28698 
    // If we have a signed multiply but no PMULDQ, fix up the result of an
    // unsigned multiply.
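    // This uses the identity (per element, modulo the lane width):
    //   mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)
    // where each conditional term is formed by AND-ing with the all-ones
    // result of the setcc.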
28701     if (IsSigned && !Subtarget.hasSSE41()) {
28702       SDValue Zero = DAG.getConstant(0, dl, VT);
28703       SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
28704                                DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
28705       SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
28706                                DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
28707 
28708       SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
28709       Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
28710     }
28711 
28712     return Res;
28713   }
28714 
28715   // Only i8 vectors should need custom lowering after this.
28716   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28717          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28718          "Unsupported vector type");
28719 
28720   // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
28721   // logical shift down the upper half and pack back to i8.
28722 
28723   // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
28724   // and then ashr/lshr the upper bits down to the lower bits before multiply.
28725 
28726   if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28727       (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28728     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28729     unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28730     SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28731     SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28732     SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28733     Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28734     return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28735   }
28736 
28737   return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
28738 }
28739 
28740 // Custom lowering for SMULO/UMULO.
28741 static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
28742                          SelectionDAG &DAG) {
28743   MVT VT = Op.getSimpleValueType();
28744 
28745   // Scalars defer to LowerXALUO.
28746   if (!VT.isVector())
28747     return LowerXALUO(Op, DAG);
28748 
28749   SDLoc dl(Op);
28750   bool IsSigned = Op->getOpcode() == ISD::SMULO;
28751   SDValue A = Op.getOperand(0);
28752   SDValue B = Op.getOperand(1);
28753   EVT OvfVT = Op->getValueType(1);
28754 
28755   if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
28756       (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
28757     // Extract the LHS Lo/Hi vectors
28758     SDValue LHSLo, LHSHi;
28759     std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
28760 
28761     // Extract the RHS Lo/Hi vectors
28762     SDValue RHSLo, RHSHi;
28763     std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
28764 
28765     EVT LoOvfVT, HiOvfVT;
28766     std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
28767     SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
28768     SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
28769 
28770     // Issue the split operations.
28771     SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
28772     SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
28773 
28774     // Join the separate data results and the overflow results.
28775     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28776     SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
28777                               Hi.getValue(1));
28778 
28779     return DAG.getMergeValues({Res, Ovf}, dl);
28780   }
28781 
28782   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28783   EVT SetccVT =
28784       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28785 
28786   if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28787       (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28788     unsigned NumElts = VT.getVectorNumElements();
28789     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28790     unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28791     SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28792     SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28793     SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28794 
28795     SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28796 
28797     SDValue Ovf;
28798     if (IsSigned) {
28799       SDValue High, LowSign;
28800       if (OvfVT.getVectorElementType() == MVT::i1 &&
28801           (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
        // Rather than truncating, try to do the compare on vXi16 or vXi32.
28803         // Shift the high down filling with sign bits.
28804         High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
28805         // Fill all 16 bits with the sign bit from the low.
28806         LowSign =
28807             getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
28808         LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
28809                                              15, DAG);
28810         SetccVT = OvfVT;
28811         if (!Subtarget.hasBWI()) {
28812           // We can't do a vXi16 compare so sign extend to v16i32.
28813           High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
28814           LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
28815         }
28816       } else {
28817         // Otherwise do the compare at vXi8.
28818         High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28819         High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28820         LowSign =
28821             DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28822       }
28823 
28824       Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28825     } else {
28826       SDValue High =
28827           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28828       if (OvfVT.getVectorElementType() == MVT::i1 &&
28829           (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
        // Rather than truncating, try to do the compare on vXi16 or vXi32.
28831         SetccVT = OvfVT;
28832         if (!Subtarget.hasBWI()) {
28833           // We can't do a vXi16 compare so sign extend to v16i32.
28834           High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
28835         }
28836       } else {
28837         // Otherwise do the compare at vXi8.
28838         High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28839       }
28840 
28841       Ovf =
28842           DAG.getSetCC(dl, SetccVT, High,
28843                        DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
28844     }
28845 
28846     Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28847 
28848     return DAG.getMergeValues({Low, Ovf}, dl);
28849   }
28850 
28851   SDValue Low;
28852   SDValue High =
28853       LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
28854 
28855   SDValue Ovf;
28856   if (IsSigned) {
28857     // SMULO overflows if the high bits don't match the sign of the low.
28858     SDValue LowSign =
28859         DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28860     Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28861   } else {
28862     // UMULO overflows if the high bits are non-zero.
28863     Ovf =
28864         DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
28865   }
28866 
28867   Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28868 
28869   return DAG.getMergeValues({Low, Ovf}, dl);
28870 }
28871 
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op,
                                             SelectionDAG &DAG) const {
28873   assert(Subtarget.isTargetWin64() && "Unexpected target");
28874   EVT VT = Op.getValueType();
28875   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28876          "Unexpected return type for lowering");
28877 
28878   RTLIB::Libcall LC;
28879   bool isSigned;
28880   switch (Op->getOpcode()) {
28881   default: llvm_unreachable("Unexpected request for libcall!");
28882   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
28883   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
28884   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
28885   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
28886   }
28887 
28888   SDLoc dl(Op);
28889   SDValue InChain = DAG.getEntryNode();
28890 
28891   TargetLowering::ArgListTy Args;
28892   TargetLowering::ArgListEntry Entry;
28893   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
28894     EVT ArgVT = Op->getOperand(i).getValueType();
28895     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28896            "Unexpected argument type for lowering");
28897     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28898     int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28899     MachinePointerInfo MPI =
28900         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28901     Entry.Node = StackPtr;
28902     InChain =
28903         DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
28904     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
    Entry.Ty = PointerType::get(ArgTy, 0);
28906     Entry.IsSExt = false;
28907     Entry.IsZExt = false;
28908     Args.push_back(Entry);
28909   }
28910 
28911   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
28912                                          getPointerTy(DAG.getDataLayout()));
28913 
28914   TargetLowering::CallLoweringInfo CLI(DAG);
28915   CLI.setDebugLoc(dl)
28916       .setChain(InChain)
28917       .setLibCallee(
28918           getLibcallCallingConv(LC),
28919           static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
28920           std::move(Args))
28921       .setInRegister()
28922       .setSExtResult(isSigned)
28923       .setZExtResult(!isSigned);
28924 
28925   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
28926   return DAG.getBitcast(VT, CallInfo.first);
28927 }
28928 
28929 SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
28930                                                    SelectionDAG &DAG,
28931                                                    SDValue &Chain) const {
28932   assert(Subtarget.isTargetWin64() && "Unexpected target");
28933   EVT VT = Op.getValueType();
28934   bool IsStrict = Op->isStrictFPOpcode();
28935 
28936   SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
28937   EVT ArgVT = Arg.getValueType();
28938 
28939   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28940          "Unexpected return type for lowering");
28941 
28942   RTLIB::Libcall LC;
28943   if (Op->getOpcode() == ISD::FP_TO_SINT ||
28944       Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
28945     LC = RTLIB::getFPTOSINT(ArgVT, VT);
28946   else
28947     LC = RTLIB::getFPTOUINT(ArgVT, VT);
28948   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
28949 
28950   SDLoc dl(Op);
28951   MakeLibCallOptions CallOptions;
28952   Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
28953 
28954   SDValue Result;
  // The i128 result is returned as a v2i64 in xmm0; bitcast it back to the
  // expected VT (i128).
28957   std::tie(Result, Chain) =
28958       makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
28959   Result = DAG.getBitcast(VT, Result);
28960   return Result;
28961 }
28962 
28963 SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
28964                                                    SelectionDAG &DAG) const {
28965   assert(Subtarget.isTargetWin64() && "Unexpected target");
28966   EVT VT = Op.getValueType();
28967   bool IsStrict = Op->isStrictFPOpcode();
28968 
28969   SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
28970   EVT ArgVT = Arg.getValueType();
28971 
28972   assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28973          "Unexpected argument type for lowering");
28974 
28975   RTLIB::Libcall LC;
28976   if (Op->getOpcode() == ISD::SINT_TO_FP ||
28977       Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
28978     LC = RTLIB::getSINTTOFP(ArgVT, VT);
28979   else
28980     LC = RTLIB::getUINTTOFP(ArgVT, VT);
28981   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
28982 
28983   SDLoc dl(Op);
28984   MakeLibCallOptions CallOptions;
28985   SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
28986 
28987   // Pass the i128 argument as an indirect argument on the stack.
28988   SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28989   int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28990   MachinePointerInfo MPI =
28991       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28992   Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
28993 
28994   SDValue Result;
28995   std::tie(Result, Chain) =
28996       makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
28997   return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
28998 }
28999 
29000 // Return true if the required (according to Opcode) shift-imm form is natively
29001 // supported by the Subtarget
29002 static bool supportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
29003                                         unsigned Opcode) {
29004   if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29005     return false;
29006 
29007   if (VT.getScalarSizeInBits() < 16)
29008     return false;
29009 
29010   if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
29011       (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
29012     return true;
29013 
29014   bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
29015                 (VT.is256BitVector() && Subtarget.hasInt256());
29016 
29017   bool AShift = LShift && (Subtarget.hasAVX512() ||
29018                            (VT != MVT::v2i64 && VT != MVT::v4i64));
29019   return (Opcode == ISD::SRA) ? AShift : LShift;
29020 }
29021 
29022 // The shift amount is a variable, but it is the same for all vector lanes.
29023 // These instructions are defined together with shift-immediate.
29024 static
29025 bool supportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
29026                                       unsigned Opcode) {
29027   return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
29028 }
29029 
29030 // Return true if the required (according to Opcode) variable-shift form is
29031 // natively supported by the Subtarget
29032 static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
29033                                     unsigned Opcode) {
29034   if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29035     return false;
29036 
29037   if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
29038     return false;
29039 
29040   // vXi16 supported only on AVX-512, BWI
29041   if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
29042     return false;
29043 
29044   if (Subtarget.hasAVX512() &&
29045       (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
29046     return true;
29047 
29048   bool LShift = VT.is128BitVector() || VT.is256BitVector();
  bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
29050   return (Opcode == ISD::SRA) ? AShift : LShift;
29051 }
29052 
29053 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
29054                                            const X86Subtarget &Subtarget) {
29055   MVT VT = Op.getSimpleValueType();
29056   SDLoc dl(Op);
29057   SDValue R = Op.getOperand(0);
29058   SDValue Amt = Op.getOperand(1);
29059   unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
29060 
29061   auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
29062     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
29063     MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
29064     SDValue Ex = DAG.getBitcast(ExVT, R);
29065 
29066     // ashr(R, 63) === cmp_slt(R, 0)
29067     if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
29068       assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
29069              "Unsupported PCMPGT op");
29070       return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
29071     }
29072 
29073     if (ShiftAmt >= 32) {
29074       // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
29075       SDValue Upper =
29076           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
29077       SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29078                                                  ShiftAmt - 32, DAG);
29079       if (VT == MVT::v2i64)
29080         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
29081       if (VT == MVT::v4i64)
29082         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29083                                   {9, 1, 11, 3, 13, 5, 15, 7});
29084     } else {
29085       // SRA upper i32, SRL whole i64 and select lower i32.
29086       SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29087                                                  ShiftAmt, DAG);
29088       SDValue Lower =
29089           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
29090       Lower = DAG.getBitcast(ExVT, Lower);
29091       if (VT == MVT::v2i64)
29092         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
29093       if (VT == MVT::v4i64)
29094         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29095                                   {8, 1, 10, 3, 12, 5, 14, 7});
29096     }
29097     return DAG.getBitcast(VT, Ex);
29098   };
29099 
29100   // Optimize shl/srl/sra with constant shift amount.
29101   APInt APIntShiftAmt;
29102   if (!X86::isConstantSplat(Amt, APIntShiftAmt))
29103     return SDValue();
29104 
29105   // If the shift amount is out of range, return undef.
29106   if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
29107     return DAG.getUNDEF(VT);
29108 
29109   uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
29110 
29111   if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
29112     return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
29113 
29114   // i64 SRA needs to be performed as partial shifts.
29115   if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
29116        (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
29117       Op.getOpcode() == ISD::SRA)
29118     return ArithmeticShiftRight64(ShiftAmt);
29119 
29120   if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29121       (Subtarget.hasBWI() && VT == MVT::v64i8)) {
29122     unsigned NumElts = VT.getVectorNumElements();
29123     MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29124 
29125     // Simple i8 add case
29126     if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29127       // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29128       // must be 0). (add undef, undef) however can be any value. To make this
29129       // safe, we must freeze R to ensure that register allocation uses the same
29130       // register for an undefined value. This ensures that the result will
29131       // still be even and preserves the original semantics.
29132       R = DAG.getNode(ISD::FREEZE, dl, VT, R);
29133       return DAG.getNode(ISD::ADD, dl, VT, R, R);
29134     }
29135 
29136     // ashr(R, 7)  === cmp_slt(R, 0)
29137     if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
29138       SDValue Zeros = DAG.getConstant(0, dl, VT);
29139       if (VT.is512BitVector()) {
29140         assert(VT == MVT::v64i8 && "Unexpected element type!");
29141         SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
29142         return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
29143       }
29144       return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
29145     }
29146 
29147     // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
29148     if (VT == MVT::v16i8 && Subtarget.hasXOP())
29149       return SDValue();
29150 
29151     if (Op.getOpcode() == ISD::SHL) {
29152       // Make a large shift.
29153       SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
29154                                                ShiftAmt, DAG);
29155       SHL = DAG.getBitcast(VT, SHL);
29156       // Zero out the rightmost bits.
29157       APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
29158       return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
29159     }
29160     if (Op.getOpcode() == ISD::SRL) {
29161       // Make a large shift.
29162       SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
29163                                                ShiftAmt, DAG);
29164       SRL = DAG.getBitcast(VT, SRL);
29165       // Zero out the leftmost bits.
29166       APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
29167       return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
29168     }
29169     if (Op.getOpcode() == ISD::SRA) {
29170       // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
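      // Worked example for illustration: for i8 R = 0xf0 (-16), Amt = 2 the
      // lshr gives 0x3c and Mask = 0x20, so (0x3c ^ 0x20) - 0x20 = -4, which
      // matches ashr(-16, 2).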
29171       SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29172 
29173       SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
29174       Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
29175       Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
29176       return Res;
29177     }
29178     llvm_unreachable("Unknown shift opcode.");
29179   }
29180 
29181   return SDValue();
29182 }
29183 
29184 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
29185                                           const X86Subtarget &Subtarget) {
29186   MVT VT = Op.getSimpleValueType();
29187   SDLoc dl(Op);
29188   SDValue R = Op.getOperand(0);
29189   SDValue Amt = Op.getOperand(1);
29190   unsigned Opcode = Op.getOpcode();
29191   unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
29192   unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
29193 
29194   if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
29195     if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
29196       MVT EltVT = VT.getVectorElementType();
29197       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
29198       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
29199         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
29200       else if (EltVT.bitsLT(MVT::i32))
29201         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
29202 
29203       return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
29204     }
29205 
29206     // vXi8 shifts - shift as v8i16 + mask result.
29207     if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
29208          (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
29209          VT == MVT::v64i8) &&
29210         !Subtarget.hasXOP()) {
29211       unsigned NumElts = VT.getVectorNumElements();
29212       MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29213       if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
29214         unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
29215         unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
29216         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
29217 
29218         // Create the mask using vXi16 shifts. For shift-rights we need to move
29219         // the upper byte down before splatting the vXi8 mask.
29220         SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
29221         BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
29222                                       BaseShAmt, Subtarget, DAG);
29223         if (Opcode != ISD::SHL)
29224           BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
29225                                                8, DAG);
29226         BitMask = DAG.getBitcast(VT, BitMask);
29227         BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
29228                                        SmallVector<int, 64>(NumElts, 0));
29229 
29230         SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
29231                                           DAG.getBitcast(ExtVT, R), BaseShAmt,
29232                                           Subtarget, DAG);
29233         Res = DAG.getBitcast(VT, Res);
29234         Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
29235 
29236         if (Opcode == ISD::SRA) {
29237           // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
29238           // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
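          // 0x8080 holds the sign bit of both bytes of each i16 lane, so a
          // single PSRLW produces the per-byte mask.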
29239           SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
29240           SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
29241                                          BaseShAmt, Subtarget, DAG);
29242           SignMask = DAG.getBitcast(VT, SignMask);
29243           Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
29244           Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
29245         }
29246         return Res;
29247       }
29248     }
29249   }
29250 
29251   // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
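  // A splatted i64 amount then appears as a bitcast BUILD_VECTOR of narrower
  // elements, so check that every group of sub-elements matches the first.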
29252   if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
29253       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
29254     Amt = Amt.getOperand(0);
29255     unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
29256     std::vector<SDValue> Vals(Ratio);
29257     for (unsigned i = 0; i != Ratio; ++i)
29258       Vals[i] = Amt.getOperand(i);
29259     for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
29260       for (unsigned j = 0; j != Ratio; ++j)
29261         if (Vals[j] != Amt.getOperand(i + j))
29262           return SDValue();
29263     }
29264 
29265     if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
29266       return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
29267   }
29268   return SDValue();
29269 }
29270 
29271 // Convert a shift/rotate left amount to a multiplication scale factor.
29272 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
29273                                        const X86Subtarget &Subtarget,
29274                                        SelectionDAG &DAG) {
29275   MVT VT = Amt.getSimpleValueType();
29276   if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
29277         (Subtarget.hasInt256() && VT == MVT::v16i16) ||
29278         (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
29279         (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
29280         (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29281         (Subtarget.hasBWI() && VT == MVT::v64i8)))
29282     return SDValue();
29283 
29284   MVT SVT = VT.getVectorElementType();
29285   unsigned SVTBits = SVT.getSizeInBits();
29286   unsigned NumElems = VT.getVectorNumElements();
29287 
29288   APInt UndefElts;
29289   SmallVector<APInt> EltBits;
29290   if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
29291     APInt One(SVTBits, 1);
29292     SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
29293     for (unsigned I = 0; I != NumElems; ++I) {
29294       if (UndefElts[I] || EltBits[I].uge(SVTBits))
29295         continue;
29296       uint64_t ShAmt = EltBits[I].getZExtValue();
29297       Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
29298     }
29299     return DAG.getBuildVector(VT, dl, Elts);
29300   }
29301 
29302   // If the target doesn't support variable shifts, use either FP conversion
29303   // or integer multiplication to avoid shifting each element individually.
29304   if (VT == MVT::v4i32) {
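    // Build 2^Amt as a float: move Amt into the exponent field (bit 23), add
    // the bias for 1.0f (0x3f800000), then convert back to an integer.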
29305     Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
29306     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
29307                       DAG.getConstant(0x3f800000U, dl, VT));
29308     Amt = DAG.getBitcast(MVT::v4f32, Amt);
29309     return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
29310   }
29311 
29312   // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
29313   if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
29314     SDValue Z = DAG.getConstant(0, dl, VT);
29315     SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
29316     SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
29317     Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
29318     Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
29319     if (Subtarget.hasSSE41())
29320       return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29321     return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
29322   }
29323 
29324   return SDValue();
29325 }
29326 
29327 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
29328                           SelectionDAG &DAG) {
29329   MVT VT = Op.getSimpleValueType();
29330   SDLoc dl(Op);
29331   SDValue R = Op.getOperand(0);
29332   SDValue Amt = Op.getOperand(1);
29333   unsigned EltSizeInBits = VT.getScalarSizeInBits();
29334   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29335 
29336   unsigned Opc = Op.getOpcode();
29337   unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
29338   unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
29339 
29340   assert(VT.isVector() && "Custom lowering only for vector shifts!");
29341   assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
29342 
29343   if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
29344     return V;
29345 
29346   if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
29347     return V;
29348 
29349   if (supportedVectorVarShift(VT, Subtarget, Opc))
29350     return Op;
29351 
29352   // i64 vector arithmetic shift can be emulated with the transform:
29353   // M = lshr(SIGN_MASK, Amt)
29354   // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
29355   if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
29356        (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
29357       Opc == ISD::SRA) {
29358     SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
29359     SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
29360     R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29361     R = DAG.getNode(ISD::XOR, dl, VT, R, M);
29362     R = DAG.getNode(ISD::SUB, dl, VT, R, M);
29363     return R;
29364   }
29365 
29366   // XOP has 128-bit variable logical/arithmetic shifts.
29367   // +ve/-ve Amt = shift left/right.
29368   if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
29369                              VT == MVT::v8i16 || VT == MVT::v16i8)) {
29370     if (Opc == ISD::SRL || Opc == ISD::SRA) {
29371       SDValue Zero = DAG.getConstant(0, dl, VT);
29372       Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
29373     }
29374     if (Opc == ISD::SHL || Opc == ISD::SRL)
29375       return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
29376     if (Opc == ISD::SRA)
29377       return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
29378   }
29379 
  // v2i64 vector logical shifts can efficiently avoid scalarization - do the
  // shifts per-lane and then shuffle the partial results back together.
29382   if (VT == MVT::v2i64 && Opc != ISD::SRA) {
    // Splat the shift amounts so the scalar shifts above will catch them.
29384     SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
29385     SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
29386     SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
29387     SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
29388     return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
29389   }
29390 
29391   // If possible, lower this shift as a sequence of two shifts by
29392   // constant plus a BLENDing shuffle instead of scalarizing it.
29393   // Example:
29394   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
29395   //
29396   // Could be rewritten as:
29397   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
29398   //
29399   // The advantage is that the two shifts from the example would be
29400   // lowered as X86ISD::VSRLI nodes in parallel before blending.
29401   if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
29402                       (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29403     SDValue Amt1, Amt2;
29404     unsigned NumElts = VT.getVectorNumElements();
29405     SmallVector<int, 8> ShuffleMask;
29406     for (unsigned i = 0; i != NumElts; ++i) {
29407       SDValue A = Amt->getOperand(i);
29408       if (A.isUndef()) {
29409         ShuffleMask.push_back(SM_SentinelUndef);
29410         continue;
29411       }
29412       if (!Amt1 || Amt1 == A) {
29413         ShuffleMask.push_back(i);
29414         Amt1 = A;
29415         continue;
29416       }
29417       if (!Amt2 || Amt2 == A) {
29418         ShuffleMask.push_back(i + NumElts);
29419         Amt2 = A;
29420         continue;
29421       }
29422       break;
29423     }
29424 
29425     // Only perform this blend if we can perform it without loading a mask.
29426     if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
29427         (VT != MVT::v16i16 ||
29428          is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
29429         (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
29430          canWidenShuffleElements(ShuffleMask))) {
29431       auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
29432       auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
29433       if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
29434           Cst2->getAPIntValue().ult(EltSizeInBits)) {
29435         SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29436                                                     Cst1->getZExtValue(), DAG);
29437         SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29438                                                     Cst2->getZExtValue(), DAG);
29439         return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
29440       }
29441     }
29442   }
29443 
29444   // If possible, lower this packed shift into a vector multiply instead of
29445   // expanding it into a sequence of scalar shifts.
29446   // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
29447   if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
29448                                                 Subtarget.canExtendTo512BW())))
29449     if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
29450       return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
29451 
  // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we can
  // replace it with ISD::MULHU, creating a scale factor from
  // (NumEltBits - Amt).
29454   if (Opc == ISD::SRL && ConstantAmt &&
29455       (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29456     SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29457     SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29458     if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
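      // MULHU with a scale of 2^(EltBits - Amt) performs the shift, but a zero
      // amount has no valid scale (the lane is left undef), so keep the
      // original value for those lanes.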
29459       SDValue Zero = DAG.getConstant(0, dl, VT);
29460       SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
29461       SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
29462       return DAG.getSelect(dl, VT, ZAmt, R, Res);
29463     }
29464   }
29465 
  // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we can
  // replace it with ISD::MULHS, creating a scale factor from
  // (NumEltBits - Amt).
29468   // TODO: Special case handling for shift by 0/1, really we can afford either
29469   // of these cases in pre-SSE41/XOP/AVX512 but not both.
29470   if (Opc == ISD::SRA && ConstantAmt &&
29471       (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
29472       ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
29473         !Subtarget.hasAVX512()) ||
29474        DAG.isKnownNeverZero(Amt))) {
29475     SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29476     SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29477     if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
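      // MULHS with a scale of 2^(EltBits - Amt) performs the shift, but an
      // amount of 0 has no valid scale and an amount of 1 needs 2^15, which is
      // negative as an i16, so fix up both cases with selects.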
29478       SDValue Amt0 =
29479           DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
29480       SDValue Amt1 =
29481           DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
29482       SDValue Sra1 =
29483           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
29484       SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
29485       Res = DAG.getSelect(dl, VT, Amt0, R, Res);
29486       return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
29487     }
29488   }
29489 
29490   // v4i32 Non Uniform Shifts.
29491   // If the shift amount is constant we can shift each lane using the SSE2
29492   // immediate shifts, else we need to zero-extend each lane to the lower i64
29493   // and shift using the SSE2 variable shifts.
29494   // The separate results can then be blended together.
29495   if (VT == MVT::v4i32) {
29496     SDValue Amt0, Amt1, Amt2, Amt3;
29497     if (ConstantAmt) {
29498       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
29499       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
29500       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
29501       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
29502     } else {
29503       // The SSE2 shifts use the lower i64 as the same shift amount for
29504       // all lanes and the upper i64 is ignored. On AVX we're better off
29505       // just zero-extending, but for SSE just duplicating the top 16-bits is
29506       // cheaper and has the same effect for out of range values.
29507       if (Subtarget.hasAVX()) {
29508         SDValue Z = DAG.getConstant(0, dl, VT);
29509         Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
29510         Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
29511         Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
29512         Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
29513       } else {
29514         SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
29515         SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29516                                              {4, 5, 6, 7, -1, -1, -1, -1});
29517         Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29518                                     {0, 1, 1, 1, -1, -1, -1, -1});
29519         Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29520                                     {2, 3, 3, 3, -1, -1, -1, -1});
29521         Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
29522                                     {0, 1, 1, 1, -1, -1, -1, -1});
29523         Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
29524                                     {2, 3, 3, 3, -1, -1, -1, -1});
29525       }
29526     }
29527 
29528     unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
29529     SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
29530     SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
29531     SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
29532     SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
29533 
29534     // Merge the shifted lane results optimally with/without PBLENDW.
29535     // TODO - ideally shuffle combining would handle this.
29536     if (Subtarget.hasSSE41()) {
29537       SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
29538       SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
29539       return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
29540     }
29541     SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
29542     SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
29543     return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
29544   }
29545 
29546   // It's worth extending once and using the vXi16/vXi32 shifts for smaller
29547   // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
29548   // make the existing SSE solution better.
  // NOTE: We honor the preferred vector width before promoting to 512 bits.
29550   if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
29551       (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
29552       (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
29553       (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
29554       (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
29555     assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
29556            "Unexpected vector type");
29557     MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
29558     MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
29559     unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29560     R = DAG.getNode(ExtOpc, dl, ExtVT, R);
29561     Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
29562     return DAG.getNode(ISD::TRUNCATE, dl, VT,
29563                        DAG.getNode(Opc, dl, ExtVT, R, Amt));
29564   }
29565 
29566   // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
29567   // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
29568   if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
29569       (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29570        (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29571       !Subtarget.hasXOP()) {
29572     int NumElts = VT.getVectorNumElements();
29573     SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
29574 
29575     // Extend constant shift amount to vXi16 (it doesn't matter if the type
29576     // isn't legal).
29577     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29578     Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
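    // Turn each amount into the power-of-2 scale 2^(8 - Amt) so the shift can
    // be performed as a multiply.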
29579     Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
29580     Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
29581     assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
29582            "Constant build vector expected");
29583 
29584     if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
29585       R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
29586                           : DAG.getZExtOrTrunc(R, dl, ExVT);
29587       R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
29588       R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
29589       return DAG.getZExtOrTrunc(R, dl, VT);
29590     }
29591 
29592     SmallVector<SDValue, 16> LoAmt, HiAmt;
29593     for (int i = 0; i != NumElts; i += 16) {
29594       for (int j = 0; j != 8; ++j) {
29595         LoAmt.push_back(Amt.getOperand(i + j));
29596         HiAmt.push_back(Amt.getOperand(i + j + 8));
29597       }
29598     }
29599 
29600     MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
29601     SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
29602     SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
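    // Unpack the bytes into the high half of i16 lanes, shift down by 8 to
    // sign/zero-extend, multiply by the per-lane scale, then shift the result
    // back into the low byte before packing.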
29603 
29604     SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
29605     SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
29606     LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
29607     HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
29608     LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
29609     HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
29610     LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
29611     HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
29612     return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
29613   }
29614 
29615   if (VT == MVT::v16i8 ||
29616       (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
29617       (VT == MVT::v64i8 && Subtarget.hasBWI())) {
29618     MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
29619 
29620     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29621       if (VT.is512BitVector()) {
29622         // On AVX512BW targets we make use of the fact that VSELECT lowers
29623         // to a masked blend which selects bytes based just on the sign bit
29624         // extracted to a mask.
29625         MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
29626         V0 = DAG.getBitcast(VT, V0);
29627         V1 = DAG.getBitcast(VT, V1);
29628         Sel = DAG.getBitcast(VT, Sel);
29629         Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
29630                            ISD::SETGT);
29631         return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
29632       } else if (Subtarget.hasSSE41()) {
29633         // On SSE41 targets we can use PBLENDVB which selects bytes based just
29634         // on the sign bit.
29635         V0 = DAG.getBitcast(VT, V0);
29636         V1 = DAG.getBitcast(VT, V1);
29637         Sel = DAG.getBitcast(VT, Sel);
29638         return DAG.getBitcast(SelVT,
29639                               DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
29640       }
29641       // On pre-SSE41 targets we test for the sign bit by comparing to
29642       // zero - a negative value will set all bits of the lanes to true
29643       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29644       SDValue Z = DAG.getConstant(0, dl, SelVT);
29645       SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
29646       return DAG.getSelect(dl, SelVT, C, V0, V1);
29647     };
29648 
29649     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29650     // We can safely do this using i16 shifts as we're only interested in
29651     // the 3 lower bits of each byte.
29652     Amt = DAG.getBitcast(ExtVT, Amt);
29653     Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
29654     Amt = DAG.getBitcast(VT, Amt);
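    // Shift in steps of 4, 2 and 1 bits, each time selecting the shifted or
    // unshifted value using the relevant amount bit, which has been moved
    // into the sign bit position.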
29655 
29656     if (Opc == ISD::SHL || Opc == ISD::SRL) {
29657       // r = VSELECT(r, shift(r, 4), a);
29658       SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
29659       R = SignBitSelect(VT, Amt, M, R);
29660 
29661       // a += a
29662       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29663 
29664       // r = VSELECT(r, shift(r, 2), a);
29665       M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
29666       R = SignBitSelect(VT, Amt, M, R);
29667 
29668       // a += a
29669       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29670 
29671       // return VSELECT(r, shift(r, 1), a);
29672       M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
29673       R = SignBitSelect(VT, Amt, M, R);
29674       return R;
29675     }
29676 
29677     if (Opc == ISD::SRA) {
      // For SRA we need to unpack each byte to the higher byte of an i16 vector
29679       // so we can correctly sign extend. We don't care what happens to the
29680       // lower byte.
29681       SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29682       SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29683       SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
29684       SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
29685       ALo = DAG.getBitcast(ExtVT, ALo);
29686       AHi = DAG.getBitcast(ExtVT, AHi);
29687       RLo = DAG.getBitcast(ExtVT, RLo);
29688       RHi = DAG.getBitcast(ExtVT, RHi);
29689 
29690       // r = VSELECT(r, shift(r, 4), a);
29691       SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
29692       SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
29693       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29694       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29695 
29696       // a += a
29697       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29698       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29699 
29700       // r = VSELECT(r, shift(r, 2), a);
29701       MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
29702       MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
29703       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29704       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29705 
29706       // a += a
29707       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29708       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29709 
29710       // r = VSELECT(r, shift(r, 1), a);
29711       MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
29712       MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
29713       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29714       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29715 
29716       // Logical shift the result back to the lower byte, leaving a zero upper
29717       // byte meaning that we can safely pack with PACKUSWB.
29718       RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
29719       RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
29720       return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
29721     }
29722   }
29723 
29724   if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
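    // Unpack R with zeros into the upper half of i32 lanes (and the amounts
    // into the lower half), shift at i32 granularity, then shift back down
    // and pack.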
29725     MVT ExtVT = MVT::v8i32;
29726     SDValue Z = DAG.getConstant(0, dl, VT);
29727     SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
29728     SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
29729     SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
29730     SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
29731     ALo = DAG.getBitcast(ExtVT, ALo);
29732     AHi = DAG.getBitcast(ExtVT, AHi);
29733     RLo = DAG.getBitcast(ExtVT, RLo);
29734     RHi = DAG.getBitcast(ExtVT, RHi);
29735     SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
29736     SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
29737     Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
29738     Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
29739     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29740   }
29741 
29742   if (VT == MVT::v8i16) {
29743     // If we have a constant shift amount, the non-SSE41 path is best as
    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
29745     bool UseSSE41 = Subtarget.hasSSE41() &&
29746                     !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29747 
29748     auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
29749       // On SSE41 targets we can use PBLENDVB which selects bytes based just on
29750       // the sign bit.
29751       if (UseSSE41) {
29752         MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
29753         V0 = DAG.getBitcast(ExtVT, V0);
29754         V1 = DAG.getBitcast(ExtVT, V1);
29755         Sel = DAG.getBitcast(ExtVT, Sel);
29756         return DAG.getBitcast(
29757             VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
29758       }
29759       // On pre-SSE41 targets we splat the sign bit - a negative value will
29760       // set all bits of the lanes to true and VSELECT uses that in
29761       // its OR(AND(V0,C),AND(V1,~C)) lowering.
29762       SDValue C =
29763           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
29764       return DAG.getSelect(dl, VT, C, V0, V1);
29765     };
29766 
29767     // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
29768     if (UseSSE41) {
29769       // On SSE41 targets we need to replicate the shift mask in both
29770       // bytes for PBLENDVB.
29771       Amt = DAG.getNode(
29772           ISD::OR, dl, VT,
29773           getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
29774           getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
29775     } else {
29776       Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
29777     }
29778 
29779     // r = VSELECT(r, shift(r, 8), a);
29780     SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
29781     R = SignBitSelect(Amt, M, R);
29782 
29783     // a += a
29784     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29785 
29786     // r = VSELECT(r, shift(r, 4), a);
29787     M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
29788     R = SignBitSelect(Amt, M, R);
29789 
29790     // a += a
29791     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29792 
29793     // r = VSELECT(r, shift(r, 2), a);
29794     M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
29795     R = SignBitSelect(Amt, M, R);
29796 
29797     // a += a
29798     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29799 
29800     // return VSELECT(r, shift(r, 1), a);
29801     M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
29802     R = SignBitSelect(Amt, M, R);
29803     return R;
29804   }
29805 
29806   // Decompose 256-bit shifts into 128-bit shifts.
29807   if (VT.is256BitVector())
29808     return splitVectorIntBinary(Op, DAG);
29809 
29810   if (VT == MVT::v32i16 || VT == MVT::v64i8)
29811     return splitVectorIntBinary(Op, DAG);
29812 
29813   return SDValue();
29814 }
29815 
29816 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
29817                                 SelectionDAG &DAG) {
29818   MVT VT = Op.getSimpleValueType();
29819   assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
29820          "Unexpected funnel shift opcode!");
29821 
29822   SDLoc DL(Op);
29823   SDValue Op0 = Op.getOperand(0);
29824   SDValue Op1 = Op.getOperand(1);
29825   SDValue Amt = Op.getOperand(2);
29826   unsigned EltSizeInBits = VT.getScalarSizeInBits();
29827   bool IsFSHR = Op.getOpcode() == ISD::FSHR;
29828 
29829   if (VT.isVector()) {
29830     APInt APIntShiftAmt;
29831     bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
29832 
29833     if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
29834       if (IsFSHR)
29835         std::swap(Op0, Op1);
29836 
29837       if (IsCstSplat) {
29838         uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
29839         SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
29840         return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
29841                              {Op0, Op1, Imm}, DAG, Subtarget);
29842       }
29843       return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
29844                            {Op0, Op1, Amt}, DAG, Subtarget);
29845     }
29846     assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
29847             VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
29848             VT == MVT::v16i32) &&
29849            "Unexpected funnel shift type!");
29850 
    // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
    // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
29853     if (IsCstSplat)
29854       return SDValue();
29855 
29856     SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
29857     SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
29858     bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
29859 
29860     // Constant vXi16 funnel shifts can be efficiently handled by default.
29861     if (IsCst && EltSizeInBits == 16)
29862       return SDValue();
29863 
29864     unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
29865     unsigned NumElts = VT.getVectorNumElements();
29866     MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
29867     MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
29868 
29869     // Split 256-bit integers on XOP/pre-AVX2 targets.
29870     // Split 512-bit integers on non 512-bit BWI targets.
29871     if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 32) ||
29872                                  !Subtarget.hasAVX2())) ||
29873         (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
29874          EltSizeInBits < 32)) {
29875       // Pre-mask the amount modulo using the wider vector.
29876       Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
29877       return splitVectorOp(Op, DAG);
29878     }
29879 
29880     // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
29881     if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
29882       if (SDValue ScalarAmt = DAG.getSplatValue(AmtMod)) {
29883         // Uniform vXi16 funnel shifts can be efficiently handled by default.
29884         if (EltSizeInBits == 16)
29885           return SDValue();
29886 
29887         SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
29888         SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
29889         ScalarAmt = DAG.getZExtOrTrunc(ScalarAmt, DL, MVT::i32);
29890         Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt, Subtarget,
29891                                  DAG);
29892         Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt, Subtarget,
29893                                  DAG);
29894         return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
29895       }
29896     }
29897 
29898     MVT WideSVT = MVT::getIntegerVT(
29899         std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
29900     MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
29901 
29902     // If per-element shifts are legal, fallback to generic expansion.
29903     if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
29904       return SDValue();
29905 
29906     // Attempt to fold as:
29907     // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
29908     // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
29909     if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
29910         supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
29911       Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
29912       Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
29913       AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
29914       Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
29915                                        EltSizeInBits, DAG);
29916       SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
29917       Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
29918       if (!IsFSHR)
29919         Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
29920                                          EltSizeInBits, DAG);
29921       return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
29922     }
29923 
29924     // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
29925     if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
29926         supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
29927       SDValue Z = DAG.getConstant(0, DL, VT);
29928       SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
29929       SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
29930       SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
29931       SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
29932       SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
29933       SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
29934       return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
29935     }
29936 
29937     // Fallback to generic expansion.
29938     return SDValue();
29939   }
29940   assert(
29941       (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
29942       "Unexpected funnel shift type!");
29943 
29944   // Expand slow SHLD/SHRD cases if we are not optimizing for size.
29945   bool OptForSize = DAG.shouldOptForSize();
29946   bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
29947 
29948   // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
29949   // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
29950   if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
29951       !isa<ConstantSDNode>(Amt)) {
29952     SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
29953     SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
29954     Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
29955     Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
29956     Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
29957     SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
29958     Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
29959     if (IsFSHR) {
29960       Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
29961     } else {
29962       Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
29963       Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
29964     }
29965     return DAG.getZExtOrTrunc(Res, DL, VT);
29966   }
29967 
29968   if (VT == MVT::i8 || ExpandFunnel)
29969     return SDValue();
29970 
29971   // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
29972   if (VT == MVT::i16) {
29973     Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
29974                       DAG.getConstant(15, DL, Amt.getValueType()));
29975     unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
29976     return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
29977   }
29978 
29979   return Op;
29980 }
29981 
29982 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
29983                            SelectionDAG &DAG) {
29984   MVT VT = Op.getSimpleValueType();
29985   assert(VT.isVector() && "Custom lowering only for vector rotates!");
29986 
29987   SDLoc DL(Op);
29988   SDValue R = Op.getOperand(0);
29989   SDValue Amt = Op.getOperand(1);
29990   unsigned Opcode = Op.getOpcode();
29991   unsigned EltSizeInBits = VT.getScalarSizeInBits();
29992   int NumElts = VT.getVectorNumElements();
29993   bool IsROTL = Opcode == ISD::ROTL;
29994 
29995   // Check for constant splat rotation amount.
29996   APInt CstSplatValue;
29997   bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
29998 
29999   // Check for splat rotate by zero.
30000   if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
30001     return R;
30002 
30003   // AVX512 implicitly uses modulo rotation amounts.
30004   if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
30005     // Attempt to rotate by immediate.
30006     if (IsCstSplat) {
30007       unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
30008       uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30009       return DAG.getNode(RotOpc, DL, VT, R,
30010                          DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30011     }
30012 
30013     // Else, fall-back on VPROLV/VPRORV.
30014     return Op;
30015   }
30016 
30017   // AVX512 VBMI2 vXi16 - lower to funnel shifts.
30018   if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
30019     unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30020     return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30021   }
30022 
30023   SDValue Z = DAG.getConstant(0, DL, VT);
30024 
30025   if (!IsROTL) {
    // If the ISD::ROTR amount is constant, we're always better off converting
    // to ISD::ROTL.
30028     if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
30029       return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
30030 
    // XOP targets always prefer ISD::ROTL.
30032     if (Subtarget.hasXOP())
30033       return DAG.getNode(ISD::ROTL, DL, VT, R,
30034                          DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
30035   }
30036 
30037   // Split 256-bit integers on XOP/pre-AVX2 targets.
30038   if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
30039     return splitVectorIntBinary(Op, DAG);
30040 
30041   // XOP has 128-bit vector variable + immediate rotates.
30042   // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
30043   // XOP implicitly uses modulo rotation amounts.
30044   if (Subtarget.hasXOP()) {
30045     assert(IsROTL && "Only ROTL expected");
30046     assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
30047 
30048     // Attempt to rotate by immediate.
30049     if (IsCstSplat) {
30050       uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30051       return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
30052                          DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30053     }
30054 
30055     // Use general rotate by variable (per-element).
30056     return Op;
30057   }
30058 
  // Rotate by a uniform constant - expand back to shifts.
30060   if (IsCstSplat)
30061     return SDValue();
30062 
30063   // Split 512-bit integers on non 512-bit BWI targets.
30064   if (VT.is512BitVector() && !Subtarget.useBWIRegs())
30065     return splitVectorIntBinary(Op, DAG);
30066 
30067   assert(
30068       (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
30069        ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
30070         Subtarget.hasAVX2()) ||
30071        ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
30072       "Only vXi32/vXi16/vXi8 vector rotates supported");
30073 
30074   MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30075   MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30076 
30077   SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30078   SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30079 
30080   // Attempt to fold as unpack(x,x) << zext(splat(y)):
30081   // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30082   // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30083   // TODO: Handle vXi16 cases on all targets.
30084   if (EltSizeInBits == 8 || EltSizeInBits == 32 ||
30085       (IsROTL && EltSizeInBits == 16 && !Subtarget.hasAVX())) {
30086     if (SDValue BaseRotAmt = DAG.getSplatValue(AmtMod)) {
30087       unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
30088       SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30089       SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30090       BaseRotAmt = DAG.getZExtOrTrunc(BaseRotAmt, DL, MVT::i32);
30091       Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
30092                                Subtarget, DAG);
30093       Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
30094                                Subtarget, DAG);
30095       return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30096     }
30097   }
30098 
30099   // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
30100   // the amount bit.
30101   // TODO: We're doing nothing here that we couldn't do for funnel shifts.
30102   if (EltSizeInBits == 8) {
30103     bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30104     MVT WideVT =
30105         MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
30106     unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
30107 
30108     // Attempt to fold as:
30109     // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
30110     // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
30111     if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30112         supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30113       // If we're rotating by constant, just use default promotion.
30114       if (IsConstAmt)
30115         return SDValue();
30116       // See if we can perform this by widening to vXi16 or vXi32.
30117       R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
30118       R = DAG.getNode(
30119           ISD::OR, DL, WideVT, R,
30120           getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
30121       Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30122       R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
30123       if (IsROTL)
30124         R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
30125       return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
30126     }
30127 
30128     // Attempt to fold as unpack(x,x) << zext(y):
30129     // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30130     // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30131     if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
30132       // See if we can perform this by unpacking to lo/hi vXi16.
30133       SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30134       SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30135       SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30136       SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30137       SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30138       SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30139       return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30140     }
30141     assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type");
30142 
30143     // We don't need ModuloAmt here as we just peek at individual bits.
30144     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30145       if (Subtarget.hasSSE41()) {
30146         // On SSE41 targets we can use PBLENDVB which selects bytes based just
30147         // on the sign bit.
30148         V0 = DAG.getBitcast(VT, V0);
30149         V1 = DAG.getBitcast(VT, V1);
30150         Sel = DAG.getBitcast(VT, Sel);
30151         return DAG.getBitcast(SelVT,
30152                               DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
30153       }
30154       // On pre-SSE41 targets we test for the sign bit by comparing to
30155       // zero - a negative value will set all bits of the lanes to true
30156       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
30157       SDValue Z = DAG.getConstant(0, DL, SelVT);
30158       SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
30159       return DAG.getSelect(DL, SelVT, C, V0, V1);
30160     };
30161 
30162     // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
30163     if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
30164       Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30165       IsROTL = true;
30166     }
30167 
30168     unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
30169     unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
30170 
30171     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
30172     // We can safely do this using i16 shifts as we're only interested in
30173     // the 3 lower bits of each byte.
30174     Amt = DAG.getBitcast(ExtVT, Amt);
30175     Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
30176     Amt = DAG.getBitcast(VT, Amt);
30177 
30178     // r = VSELECT(r, rot(r, 4), a);
30179     SDValue M;
30180     M = DAG.getNode(
30181         ISD::OR, DL, VT,
30182         DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
30183         DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
30184     R = SignBitSelect(VT, Amt, M, R);
30185 
30186     // a += a
30187     Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30188 
30189     // r = VSELECT(r, rot(r, 2), a);
30190     M = DAG.getNode(
30191         ISD::OR, DL, VT,
30192         DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
30193         DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
30194     R = SignBitSelect(VT, Amt, M, R);
30195 
30196     // a += a
30197     Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30198 
30199     // return VSELECT(r, rot(r, 1), a);
30200     M = DAG.getNode(
30201         ISD::OR, DL, VT,
30202         DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
30203         DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
30204     return SignBitSelect(VT, Amt, M, R);
30205   }
30206 
30207   bool IsSplatAmt = DAG.isSplatValue(Amt);
30208   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30209   bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
30210                         supportedVectorVarShift(VT, Subtarget, ISD::SRL);
30211 
30212   // Fallback for splats + all supported variable shifts.
  // Fallback for non-constant AVX2 vXi16 as well.
30214   if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
30215     Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30216     SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
30217     AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
30218     SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
30219     SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
30220     return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
30221   }
30222 
30223   // Everything below assumes ISD::ROTL.
30224   if (!IsROTL) {
30225     Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30226     IsROTL = true;
30227   }
30228 
30229   // ISD::ROT* uses modulo rotate amounts.
30230   Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30231 
30232   assert(IsROTL && "Only ROTL supported");
30233 
30234   // As with shifts, attempt to convert the rotation amount to a multiplication
30235   // factor, fallback to general expansion.
30236   SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
30237   if (!Scale)
30238     return SDValue();
30239 
30240   // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
30241   if (EltSizeInBits == 16) {
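    // x * Scale gives x << y in its low half (MUL) and x >> (16 - y) in its
    // high half (MULHU); OR-ing the two forms the rotate.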
30242     SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
30243     SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
30244     return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
30245   }
30246 
30247   // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
30248   // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
30249   // that can then be OR'd with the lower 32-bits.
30250   assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
30251   static const int OddMask[] = {1, -1, 3, -1};
30252   SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
30253   SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
30254 
30255   SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30256                               DAG.getBitcast(MVT::v2i64, R),
30257                               DAG.getBitcast(MVT::v2i64, Scale));
30258   SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30259                               DAG.getBitcast(MVT::v2i64, R13),
30260                               DAG.getBitcast(MVT::v2i64, Scale13));
30261   Res02 = DAG.getBitcast(VT, Res02);
30262   Res13 = DAG.getBitcast(VT, Res13);
30263 
30264   return DAG.getNode(ISD::OR, DL, VT,
30265                      DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
30266                      DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
30267 }
30268 
30269 /// Returns true if the operand type is exactly twice the native width, and
30270 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
30271 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
30272 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
30273 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
30274   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
30275 
30276   if (OpWidth == 64)
30277     return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
30278   if (OpWidth == 128)
30279     return Subtarget.hasCmpxchg16b();
30280 
30281   return false;
30282 }
30283 
30284 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
30285   Type *MemType = SI->getValueOperand()->getType();
30286 
30287   bool NoImplicitFloatOps =
30288       SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30289   if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30290       !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30291       (Subtarget.hasSSE1() || Subtarget.hasX87()))
30292     return false;
30293 
30294   return needsCmpXchgNb(MemType);
30295 }
30296 
30297 // Note: this turns large loads into lock cmpxchg8b/16b.
30298 // TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
30299 TargetLowering::AtomicExpansionKind
30300 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
30301   Type *MemType = LI->getType();
30302 
  // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled,
  // we can use movq to do the load. If we have X87 we can load into an 80-bit
30305   // X87 register and store it to a stack temporary.
30306   bool NoImplicitFloatOps =
30307       LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30308   if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30309       !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30310       (Subtarget.hasSSE1() || Subtarget.hasX87()))
30311     return AtomicExpansionKind::None;
30312 
30313   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30314                                  : AtomicExpansionKind::None;
30315 }
30316 
30317 TargetLowering::AtomicExpansionKind
30318 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
30319   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30320   Type *MemType = AI->getType();
30321 
30322   // If the operand is too big, we must see if cmpxchg8/16b is available
30323   // and default to library calls otherwise.
30324   if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
30325     return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30326                                    : AtomicExpansionKind::None;
30327   }
30328 
30329   AtomicRMWInst::BinOp Op = AI->getOperation();
30330   switch (Op) {
30331   default:
30332     llvm_unreachable("Unknown atomic operation");
30333   case AtomicRMWInst::Xchg:
30334   case AtomicRMWInst::Add:
30335   case AtomicRMWInst::Sub:
30336     // It's better to use xadd, xsub or xchg for these in all cases.
30337     return AtomicExpansionKind::None;
30338   case AtomicRMWInst::Or:
30339   case AtomicRMWInst::And:
30340   case AtomicRMWInst::Xor:
30341     // If the atomicrmw's result isn't actually used, we can just add a "lock"
30342     // prefix to a normal instruction for these operations.
30343     return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
30344                             : AtomicExpansionKind::None;
30345   case AtomicRMWInst::Nand:
30346   case AtomicRMWInst::Max:
30347   case AtomicRMWInst::Min:
30348   case AtomicRMWInst::UMax:
30349   case AtomicRMWInst::UMin:
30350   case AtomicRMWInst::FAdd:
30351   case AtomicRMWInst::FSub:
30352     // These always require a non-trivial set of data operations on x86. We must
30353     // use a cmpxchg loop.
30354     return AtomicExpansionKind::CmpXChg;
30355   }
30356 }
30357 
30358 LoadInst *
30359 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
30360   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30361   Type *MemType = AI->getType();
30362   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
30363   // there is no benefit in turning such RMWs into loads, and it is actually
  // harmful as it introduces an mfence.
30365   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
30366     return nullptr;
30367 
30368   // If this is a canonical idempotent atomicrmw w/no uses, we have a better
30369   // lowering available in lowerAtomicArith.
30370   // TODO: push more cases through this path.
30371   if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
30372     if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
30373         AI->use_empty())
30374       return nullptr;
30375 
30376   IRBuilder<> Builder(AI);
30377   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
30378   auto SSID = AI->getSyncScopeID();
30379   // We must restrict the ordering to avoid generating loads with Release or
30380   // ReleaseAcquire orderings.
  auto Order =
      AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
30382 
30383   // Before the load we need a fence. Here is an example lifted from
30384   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
30385   // is required:
30386   // Thread 0:
30387   //   x.store(1, relaxed);
30388   //   r1 = y.fetch_add(0, release);
30389   // Thread 1:
30390   //   y.fetch_add(42, acquire);
30391   //   r2 = x.load(relaxed);
  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
  // lowered to just a load without a fence. An mfence flushes the store
  // buffer, making the optimization clearly correct.
  // FIXME: the fence is required if isReleaseOrStronger(Order), but it is not
  // clearly needed otherwise; we might be able to be more aggressive on
  // relaxed idempotent rmws. In practice, they do not look useful, so we don't
  // try to be especially clever.
30399   if (SSID == SyncScope::SingleThread)
30400     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
30401     // the IR level, so we must wrap it in an intrinsic.
30402     return nullptr;
30403 
30404   if (!Subtarget.hasMFence())
30405     // FIXME: it might make sense to use a locked operation here but on a
30406     // different cache-line to prevent cache-line bouncing. In practice it
30407     // is probably a small win, and x86 processors without mfence are rare
30408     // enough that we do not bother.
30409     return nullptr;
30410 
30411   Function *MFence =
30412       llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
30413   Builder.CreateCall(MFence, {});
30414 
30415   // Finally we can emit the atomic load.
30416   LoadInst *Loaded = Builder.CreateAlignedLoad(
30417       AI->getType(), AI->getPointerOperand(), AI->getAlign());
30418   Loaded->setAtomic(Order, SSID);
30419   AI->replaceAllUsesWith(Loaded);
30420   AI->eraseFromParent();
30421   return Loaded;
30422 }
30423 
bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(
    const StoreInst &SI) const {
30425   if (!SI.isUnordered())
30426     return false;
30427   return ExperimentalUnorderedISEL;
30428 }
30429 bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
30430   if (!LI.isUnordered())
30431     return false;
30432   return ExperimentalUnorderedISEL;
30433 }
30434 
30435 
30436 /// Emit a locked operation on a stack location which does not change any
30437 /// memory location, but does involve a lock prefix.  Location is chosen to be
30438 /// a) very likely accessed only by a single thread to minimize cache traffic,
30439 /// and b) definitely dereferenceable.  Returns the new Chain result.
30440 static SDValue emitLockedStackOp(SelectionDAG &DAG,
30441                                  const X86Subtarget &Subtarget, SDValue Chain,
30442                                  const SDLoc &DL) {
30443   // Implementation notes:
  // 1) The LOCK prefix creates a full read/write reordering barrier for memory
  // operations issued by the current processor.  As such, the location
  // referenced is not relevant for the ordering properties of the instruction.
  // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
  // 8.2.3.9  Loads and Stores Are Not Reordered with Locked Instructions
30449   // 2) Using an immediate operand appears to be the best encoding choice
30450   // here since it doesn't require an extra register.
30451   // 3) OR appears to be very slightly faster than ADD. (Though, the difference
30452   // is small enough it might just be measurement noise.)
30453   // 4) When choosing offsets, there are several contributing factors:
30454   //   a) If there's no redzone, we default to TOS.  (We could allocate a cache
30455   //      line aligned stack object to improve this case.)
30456   //   b) To minimize our chances of introducing a false dependence, we prefer
30457   //      to offset the stack usage from TOS slightly.
30458   //   c) To minimize concerns about cross thread stack usage - in particular,
30459   //      the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
30460   //      captures state in the TOS frame and accesses it from many threads -
30461   //      we want to use an offset such that the offset is in a distinct cache
30462   //      line from the TOS frame.
30463   //
30464   // For a general discussion of the tradeoffs and benchmark results, see:
30465   // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
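  //
  // Illustrative sketch of the node built below (AT&T syntax, not a literal
  // encoding):
  //   lock orl $0, -64(%rsp)   // 64-bit with a 128-byte red zone
  //   lock orl $0, (%esp)      // 32-bit, no red zone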
30466 
30467   auto &MF = DAG.getMachineFunction();
30468   auto &TFL = *Subtarget.getFrameLowering();
30469   const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
30470 
30471   if (Subtarget.is64Bit()) {
30472     SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30473     SDValue Ops[] = {
30474       DAG.getRegister(X86::RSP, MVT::i64),                  // Base
30475       DAG.getTargetConstant(1, DL, MVT::i8),                // Scale
30476       DAG.getRegister(0, MVT::i64),                         // Index
30477       DAG.getTargetConstant(SPOffset, DL, MVT::i32),        // Disp
30478       DAG.getRegister(0, MVT::i16),                         // Segment.
30479       Zero,
30480       Chain};
30481     SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30482                                      MVT::Other, Ops);
30483     return SDValue(Res, 1);
30484   }
30485 
30486   SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30487   SDValue Ops[] = {
30488     DAG.getRegister(X86::ESP, MVT::i32),            // Base
30489     DAG.getTargetConstant(1, DL, MVT::i8),          // Scale
30490     DAG.getRegister(0, MVT::i32),                   // Index
30491     DAG.getTargetConstant(SPOffset, DL, MVT::i32),  // Disp
30492     DAG.getRegister(0, MVT::i16),                   // Segment.
30493     Zero,
30494     Chain
30495   };
30496   SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30497                                    MVT::Other, Ops);
30498   return SDValue(Res, 1);
30499 }
30500 
30501 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
30502                                  SelectionDAG &DAG) {
30503   SDLoc dl(Op);
30504   AtomicOrdering FenceOrdering =
30505       static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
30506   SyncScope::ID FenceSSID =
30507       static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
30508 
30509   // The only fence that needs an instruction is a sequentially-consistent
30510   // cross-thread fence.
30511   if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
30512       FenceSSID == SyncScope::System) {
30513     if (Subtarget.hasMFence())
30514       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
30515 
30516     SDValue Chain = Op.getOperand(0);
30517     return emitLockedStackOp(DAG, Subtarget, Chain, dl);
30518   }
30519 
30520   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
30521   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
30522 }
30523 
30524 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
30525                              SelectionDAG &DAG) {
30526   MVT T = Op.getSimpleValueType();
30527   SDLoc DL(Op);
30528   unsigned Reg = 0;
30529   unsigned size = 0;
30530   switch(T.SimpleTy) {
30531   default: llvm_unreachable("Invalid value type!");
30532   case MVT::i8:  Reg = X86::AL;  size = 1; break;
30533   case MVT::i16: Reg = X86::AX;  size = 2; break;
30534   case MVT::i32: Reg = X86::EAX; size = 4; break;
30535   case MVT::i64:
30536     assert(Subtarget.is64Bit() && "Node not type legal!");
30537     Reg = X86::RAX; size = 8;
30538     break;
30539   }
30540   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
30541                                   Op.getOperand(2), SDValue());
30542   SDValue Ops[] = { cpIn.getValue(0),
30543                     Op.getOperand(1),
30544                     Op.getOperand(3),
30545                     DAG.getTargetConstant(size, DL, MVT::i8),
30546                     cpIn.getValue(1) };
30547   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
30548   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
30549   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
30550                                            Ops, T, MMO);
30551 
30552   SDValue cpOut =
30553     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
30554   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
30555                                       MVT::i32, cpOut.getValue(2));
30556   SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
30557 
30558   return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
30559                      cpOut, Success, EFLAGS.getValue(1));
30560 }
30561 
30562 // Create MOVMSKB, taking into account whether we need to split for AVX1.
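// Illustrative sketch: without AVX2, a v32i8 input is split so the result is
// MOVMSK(Lo) | (MOVMSK(Hi) << 16); for v64i8, the two 32-bit halves are
// combined into an i64 the same way as Lo | (Hi << 32).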
30563 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
30564                            const X86Subtarget &Subtarget) {
30565   MVT InVT = V.getSimpleValueType();
30566 
30567   if (InVT == MVT::v64i8) {
30568     SDValue Lo, Hi;
30569     std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
30570     Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
30571     Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
30572     Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
30573     Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
30574     Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
30575                      DAG.getConstant(32, DL, MVT::i8));
30576     return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
30577   }
30578   if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
30579     SDValue Lo, Hi;
30580     std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
30581     Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
30582     Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
30583     Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
30584                      DAG.getConstant(16, DL, MVT::i8));
30585     return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
30586   }
30587 
30588   return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
30589 }
30590 
30591 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
30592                             SelectionDAG &DAG) {
30593   SDValue Src = Op.getOperand(0);
30594   MVT SrcVT = Src.getSimpleValueType();
30595   MVT DstVT = Op.getSimpleValueType();
30596 
30597   // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
30598   // half to v32i1 and concatenating the result.
30599   if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
30600     assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
30601     assert(Subtarget.hasBWI() && "Expected BWI target");
30602     SDLoc dl(Op);
30603     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
30604                              DAG.getIntPtrConstant(0, dl));
30605     Lo = DAG.getBitcast(MVT::v32i1, Lo);
30606     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
30607                              DAG.getIntPtrConstant(1, dl));
30608     Hi = DAG.getBitcast(MVT::v32i1, Hi);
30609     return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
30610   }
30611 
30612   // Use MOVMSK for vector to scalar conversion to prevent scalarization.
30613   if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
30614     assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
30615     MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
30616     SDLoc DL(Op);
30617     SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
30618     V = getPMOVMSKB(DL, V, DAG, Subtarget);
30619     return DAG.getZExtOrTrunc(V, DL, DstVT);
30620   }
30621 
30622   assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
30623           SrcVT == MVT::i64) && "Unexpected VT!");
30624 
30625   assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30626   if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
30627       !(DstVT == MVT::x86mmx && SrcVT.isVector()))
30628     // This conversion needs to be expanded.
30629     return SDValue();
30630 
30631   SDLoc dl(Op);
30632   if (SrcVT.isVector()) {
    // Widen the input vector in the case of MVT::v2i32.
    // Example: from MVT::v2i32 to MVT::v4i32.
30635     MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
30636                                  SrcVT.getVectorNumElements() * 2);
30637     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
30638                       DAG.getUNDEF(SrcVT));
30639   } else {
30640     assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
30641            "Unexpected source type in LowerBITCAST");
30642     Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
30643   }
30644 
30645   MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
30646   Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
30647 
30648   if (DstVT == MVT::x86mmx)
30649     return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
30650 
30651   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
30652                      DAG.getIntPtrConstant(0, dl));
30653 }
30654 
30655 /// Compute the horizontal sum of bytes in V for the elements of VT.
30656 ///
30657 /// Requires V to be a byte vector and VT to be an integer vector type with
30658 /// wider elements than V's type. The width of the elements of VT determines
30659 /// how many bytes of V are summed horizontally to produce each element of the
30660 /// result.
30661 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
30662                                       const X86Subtarget &Subtarget,
30663                                       SelectionDAG &DAG) {
30664   SDLoc DL(V);
30665   MVT ByteVecVT = V.getSimpleValueType();
30666   MVT EltVT = VT.getVectorElementType();
30667   assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
30668          "Expected value to have byte element type.");
30669   assert(EltVT != MVT::i8 &&
30670          "Horizontal byte sum only makes sense for wider elements!");
30671   unsigned VecSize = VT.getSizeInBits();
30672   assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
30673 
  // The PSADBW instruction horizontally adds all bytes and leaves the result
  // in i64 chunks, and thus directly computes the pop count for v2i64 and
  // v4i64.
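  // For example, for v2i64, PSADBW(V, 0) yields [sum of bytes 0-7,
  // sum of bytes 8-15], which are exactly the two 64-bit pop counts.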
30676   if (EltVT == MVT::i64) {
30677     SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
30678     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
30679     V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
30680     return DAG.getBitcast(VT, V);
30681   }
30682 
30683   if (EltVT == MVT::i32) {
30684     // We unpack the low half and high half into i32s interleaved with zeros so
30685     // that we can use PSADBW to horizontally sum them. The most useful part of
30686     // this is that it lines up the results of two PSADBW instructions to be
30687     // two v2i64 vectors which concatenated are the 4 population counts. We can
30688     // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
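    //
    // Illustrative 128-bit case (v4i32 result), writing b(Vi) for the 4 count
    // bytes of element i:
    //   Low  = [ b(V0), 0, b(V1), 0 ]   High = [ b(V2), 0, b(V3), 0 ]
    // PSADBW then gives Low = [cnt0, cnt1] and High = [cnt2, cnt3] as v2i64,
    // and since each count fits in a byte, PACKUS narrows them back into
    // [cnt0, cnt1, cnt2, cnt3].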
30689     SDValue Zeros = DAG.getConstant(0, DL, VT);
30690     SDValue V32 = DAG.getBitcast(VT, V);
30691     SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
30692     SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
30693 
30694     // Do the horizontal sums into two v2i64s.
30695     Zeros = DAG.getConstant(0, DL, ByteVecVT);
30696     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
30697     Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
30698                       DAG.getBitcast(ByteVecVT, Low), Zeros);
30699     High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
30700                        DAG.getBitcast(ByteVecVT, High), Zeros);
30701 
30702     // Merge them together.
30703     MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
30704     V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
30705                     DAG.getBitcast(ShortVecVT, Low),
30706                     DAG.getBitcast(ShortVecVT, High));
30707 
30708     return DAG.getBitcast(VT, V);
30709   }
30710 
30711   // The only element type left is i16.
30712   assert(EltVT == MVT::i16 && "Unknown how to handle type");
30713 
  // To obtain the pop count for each i16 element starting from the pop counts
  // of the i8 elements, shift the i16s left by 8, sum as i8s, and then shift
  // the i16s right by 8. It is important to shift as i16s because an i8
  // vector shift isn't directly supported.
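  //
  // Illustrative per-i16-lane view, with byte counts packed as (hi << 8) | lo:
  // (V << 8) + V has lo + hi in its high byte, and the final logical shift
  // right by 8 extracts it.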
30718   SDValue ShifterV = DAG.getConstant(8, DL, VT);
30719   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
30720   V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
30721                   DAG.getBitcast(ByteVecVT, V));
30722   return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
30723 }
30724 
30725 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
30726                                         const X86Subtarget &Subtarget,
30727                                         SelectionDAG &DAG) {
30728   MVT VT = Op.getSimpleValueType();
30729   MVT EltVT = VT.getVectorElementType();
30730   int NumElts = VT.getVectorNumElements();
30731   (void)EltVT;
30732   assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
30733 
30734   // Implement a lookup table in register by using an algorithm based on:
30735   // http://wm.ite.pl/articles/sse-popcount.html
30736   //
  // The general idea is that each nibble of every byte in the input vector is
  // an index into an in-register, pre-computed pop count table. We split the
  // input vector into two new ones: (1) a vector with only the shifted-right
  // higher nibbles of each byte and (2) a vector with the lower nibbles of
  // each byte (higher nibbles masked out). PSHUFB is used separately with both
  // to index the in-register table. Finally, both results are added, giving an
  // i8 vector where each element contains the pop count of its input byte.
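  //
  // Illustrative scalar equivalent of the approach (sketch only):
  //   uint8_t PopCnt8(uint8_t B) {
  //     static const uint8_t NibbleLUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
  //                                           1, 2, 2, 3, 2, 3, 3, 4};
  //     return NibbleLUT[B >> 4] + NibbleLUT[B & 0xF];
  //   }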
30744   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
30745                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
30746                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
30747                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
30748 
30749   SmallVector<SDValue, 64> LUTVec;
30750   for (int i = 0; i < NumElts; ++i)
30751     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
30752   SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
30753   SDValue M0F = DAG.getConstant(0x0F, DL, VT);
30754 
30755   // High nibbles
30756   SDValue FourV = DAG.getConstant(4, DL, VT);
30757   SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
30758 
30759   // Low nibbles
30760   SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
30761 
30762   // The input vector is used as the shuffle mask that index elements into the
30763   // LUT. After counting low and high nibbles, add the vector to obtain the
30764   // final pop count per i8 element.
30765   SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
30766   SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
30767   return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
30768 }
30769 
30770 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
30771 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
30772 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
30773                                 SelectionDAG &DAG) {
30774   MVT VT = Op.getSimpleValueType();
30775   assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
30776          "Unknown CTPOP type to handle");
30777   SDLoc DL(Op.getNode());
30778   SDValue Op0 = Op.getOperand(0);
30779 
30780   // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
30781   if (Subtarget.hasVPOPCNTDQ()) {
30782     unsigned NumElems = VT.getVectorNumElements();
30783     assert((VT.getVectorElementType() == MVT::i8 ||
30784             VT.getVectorElementType() == MVT::i16) && "Unexpected type");
30785     if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
30786       MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
30787       Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
30788       Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
30789       return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
30790     }
30791   }
30792 
30793   // Decompose 256-bit ops into smaller 128-bit ops.
30794   if (VT.is256BitVector() && !Subtarget.hasInt256())
30795     return splitVectorIntUnary(Op, DAG);
30796 
30797   // Decompose 512-bit ops into smaller 256-bit ops.
30798   if (VT.is512BitVector() && !Subtarget.hasBWI())
30799     return splitVectorIntUnary(Op, DAG);
30800 
30801   // For element types greater than i8, do vXi8 pop counts and a bytesum.
30802   if (VT.getScalarType() != MVT::i8) {
30803     MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
30804     SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
30805     SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
30806     return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
30807   }
30808 
30809   // We can't use the fast LUT approach, so fall back on LegalizeDAG.
30810   if (!Subtarget.hasSSSE3())
30811     return SDValue();
30812 
30813   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
30814 }
30815 
30816 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
30817                           SelectionDAG &DAG) {
30818   assert(Op.getSimpleValueType().isVector() &&
30819          "We only do custom lowering for vector population count.");
30820   return LowerVectorCTPOP(Op, Subtarget, DAG);
30821 }
30822 
30823 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
30824   MVT VT = Op.getSimpleValueType();
30825   SDValue In = Op.getOperand(0);
30826   SDLoc DL(Op);
30827 
  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
  // perform the BITREVERSE.
30830   if (!VT.isVector()) {
30831     MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
30832     SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
30833     Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
30834     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
30835                        DAG.getIntPtrConstant(0, DL));
30836   }
30837 
30838   int NumElts = VT.getVectorNumElements();
30839   int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
30840 
30841   // Decompose 256-bit ops into smaller 128-bit ops.
30842   if (VT.is256BitVector())
30843     return splitVectorIntUnary(Op, DAG);
30844 
30845   assert(VT.is128BitVector() &&
30846          "Only 128-bit vector bitreverse lowering supported.");
30847 
  // VPPERM reverses the bits of a byte with the permute op (2 << 5), and we
  // perform the BSWAP in the same shuffle.
  // It's best to shuffle using the second operand, as this implicitly allows
  // memory folding for multiple vectors.
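  // Illustrative v4i32 mask (second-source selectors, bit-reverse op 2 << 5
  // applied to each byte): { 0x53, 0x52, 0x51, 0x50, 0x57, 0x56, ... }, i.e.
  // byte-swap each element while bit-reversing each byte.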
30852   SmallVector<SDValue, 16> MaskElts;
30853   for (int i = 0; i != NumElts; ++i) {
30854     for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
30855       int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
30856       int PermuteByte = SourceByte | (2 << 5);
30857       MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
30858     }
30859   }
30860 
30861   SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
30862   SDValue Res = DAG.getBitcast(MVT::v16i8, In);
30863   Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
30864                     Res, Mask);
30865   return DAG.getBitcast(VT, Res);
30866 }
30867 
30868 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
30869                                SelectionDAG &DAG) {
30870   MVT VT = Op.getSimpleValueType();
30871 
30872   if (Subtarget.hasXOP() && !VT.is512BitVector())
30873     return LowerBITREVERSE_XOP(Op, DAG);
30874 
30875   assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
30876 
30877   SDValue In = Op.getOperand(0);
30878   SDLoc DL(Op);
30879 
30880   assert(VT.getScalarType() == MVT::i8 &&
30881          "Only byte vector BITREVERSE supported");
30882 
30883   // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
30884   if (VT == MVT::v64i8 && !Subtarget.hasBWI())
30885     return splitVectorIntUnary(Op, DAG);
30886 
30887   // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
30888   if (VT == MVT::v32i8 && !Subtarget.hasInt256())
30889     return splitVectorIntUnary(Op, DAG);
30890 
30891   unsigned NumElts = VT.getVectorNumElements();
30892 
30893   // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
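  // (0x8040201008040201 is the anti-diagonal bit matrix, so the affine
  // transform with a zero additive constant maps bit i of each byte to
  // bit 7 - i.)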
30894   if (Subtarget.hasGFNI()) {
30895     MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
30896     SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
30897     Matrix = DAG.getBitcast(VT, Matrix);
30898     return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
30899                        DAG.getTargetConstant(0, DL, MVT::i8));
30900   }
30901 
  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into two
  // nibbles, and a PSHUFB lookup finds the bitreverse of each 0-15 value
  // (moved to the opposite nibble position).
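  // Illustrative scalar equivalent (sketch only), using the tables below:
  //   BitReverse8(B) == LoLUT[B & 0xF] | HiLUT[B >> 4]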
30905   SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
30906   SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
30907   SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
30908 
30909   const int LoLUT[16] = {
30910       /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
30911       /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
30912       /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
30913       /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
30914   const int HiLUT[16] = {
30915       /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
30916       /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
30917       /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
30918       /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
30919 
30920   SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
30921   for (unsigned i = 0; i < NumElts; ++i) {
30922     LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
30923     HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
30924   }
30925 
30926   SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
30927   SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
30928   Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
30929   Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
30930   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
30931 }
30932 
30933 static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
30934                            SelectionDAG &DAG) {
30935   SDLoc DL(Op);
30936   SDValue X = Op.getOperand(0);
30937   MVT VT = Op.getSimpleValueType();
30938 
  // Special case: if the input fits in 8 bits, we can use a single 8-bit TEST.
30940   if (VT == MVT::i8 ||
30941       DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
30942     X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
30943     SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
30944                                 DAG.getConstant(0, DL, MVT::i8));
30945     // Copy the inverse of the parity flag into a register with setcc.
30946     SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
30947     // Extend to the original type.
30948     return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
30949   }
30950 
30951   // If we have POPCNT, use the default expansion.
30952   if (Subtarget.hasPOPCNT())
30953     return SDValue();
30954 
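  // Otherwise, fold the value down to a single byte by xor-ing halves together
  // (conceptually X ^= X >> 32; X ^= X >> 16; low8(X) ^ high8(X)), and read
  // the parity flag produced by the final 8-bit XOR via SETNP.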
30955   if (VT == MVT::i64) {
    // Xor the high and low 32-bit halves together using a 32-bit operation.
30957     SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
30958                              DAG.getNode(ISD::SRL, DL, MVT::i64, X,
30959                                          DAG.getConstant(32, DL, MVT::i8)));
30960     SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
30961     X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
30962   }
30963 
30964   if (VT != MVT::i16) {
    // Xor the high and low 16 bits together using a 32-bit operation.
30966     SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
30967                                DAG.getConstant(16, DL, MVT::i8));
30968     X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
30969   } else {
30970     // If the input is 16-bits, we need to extend to use an i32 shift below.
30971     X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
30972   }
30973 
  // Finally, xor the low 2 bytes together using an 8-bit flag-setting xor.
  // This should allow an h-reg to be used to save a shift.
30976   SDValue Hi = DAG.getNode(
30977       ISD::TRUNCATE, DL, MVT::i8,
30978       DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
30979   SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
30980   SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
30981   SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
30982 
30983   // Copy the inverse of the parity flag into a register with setcc.
30984   SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
30985   // Extend to the original type.
30986   return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
30987 }
30988 
30989 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
30990                                         const X86Subtarget &Subtarget) {
30991   unsigned NewOpc = 0;
30992   switch (N->getOpcode()) {
30993   case ISD::ATOMIC_LOAD_ADD:
30994     NewOpc = X86ISD::LADD;
30995     break;
30996   case ISD::ATOMIC_LOAD_SUB:
30997     NewOpc = X86ISD::LSUB;
30998     break;
30999   case ISD::ATOMIC_LOAD_OR:
31000     NewOpc = X86ISD::LOR;
31001     break;
31002   case ISD::ATOMIC_LOAD_XOR:
31003     NewOpc = X86ISD::LXOR;
31004     break;
31005   case ISD::ATOMIC_LOAD_AND:
31006     NewOpc = X86ISD::LAND;
31007     break;
31008   default:
31009     llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
31010   }
31011 
31012   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
31013 
31014   return DAG.getMemIntrinsicNode(
31015       NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
31016       {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
31017       /*MemVT=*/N->getSimpleValueType(0), MMO);
31018 }
31019 
31020 /// Lower atomic_load_ops into LOCK-prefixed operations.
31021 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
31022                                 const X86Subtarget &Subtarget) {
31023   AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
31024   SDValue Chain = N->getOperand(0);
31025   SDValue LHS = N->getOperand(1);
31026   SDValue RHS = N->getOperand(2);
31027   unsigned Opc = N->getOpcode();
31028   MVT VT = N->getSimpleValueType(0);
31029   SDLoc DL(N);
31030 
31031   // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
31032   // can only be lowered when the result is unused.  They should have already
31033   // been transformed into a cmpxchg loop in AtomicExpand.
31034   if (N->hasAnyUseOfValue(0)) {
31035     // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
31036     // select LXADD if LOCK_SUB can't be selected.
31037     if (Opc == ISD::ATOMIC_LOAD_SUB) {
31038       RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
31039       return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
31040                            RHS, AN->getMemOperand());
31041     }
31042     assert(Opc == ISD::ATOMIC_LOAD_ADD &&
31043            "Used AtomicRMW ops other than Add should have been expanded!");
31044     return N;
31045   }
31046 
  // Specialized lowering for the canonical form of an idempotent atomicrmw.
  // The core idea here is that since the memory location isn't actually
  // changing, all we need is a lowering for the *ordering* impact of the
  // atomicrmw.  As such, we can choose a different operation and memory
  // location to minimize impact on other code.
31052   if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
    // On X86, the only ordering which actually requires an instruction is
    // seq_cst outside of SingleThread scope; everything else just needs to be
    // preserved during codegen and then dropped. Note that we expect (but
    // don't assume) that orderings other than seq_cst and acq_rel have been
    // canonicalized to a store or load.
31058     if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
31059         AN->getSyncScopeID() == SyncScope::System) {
31060       // Prefer a locked operation against a stack location to minimize cache
31061       // traffic.  This assumes that stack locations are very likely to be
31062       // accessed only by the owning thread.
31063       SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
31064       assert(!N->hasAnyUseOfValue(0));
31065       // NOTE: The getUNDEF is needed to give something for the unused result 0.
31066       return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31067                          DAG.getUNDEF(VT), NewChain);
31068     }
31069     // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31070     SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
31071     assert(!N->hasAnyUseOfValue(0));
31072     // NOTE: The getUNDEF is needed to give something for the unused result 0.
31073     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31074                        DAG.getUNDEF(VT), NewChain);
31075   }
31076 
31077   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
31078   // RAUW the chain, but don't worry about the result, as it's unused.
31079   assert(!N->hasAnyUseOfValue(0));
31080   // NOTE: The getUNDEF is needed to give something for the unused result 0.
31081   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31082                      DAG.getUNDEF(VT), LockOp.getValue(1));
31083 }
31084 
31085 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
31086                                  const X86Subtarget &Subtarget) {
31087   auto *Node = cast<AtomicSDNode>(Op.getNode());
31088   SDLoc dl(Node);
31089   EVT VT = Node->getMemoryVT();
31090 
31091   bool IsSeqCst =
31092       Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
31093   bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
31094 
31095   // If this store is not sequentially consistent and the type is legal
31096   // we can just keep it.
31097   if (!IsSeqCst && IsTypeLegal)
31098     return Op;
31099 
31100   if (VT == MVT::i64 && !IsTypeLegal) {
31101     // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
31102     // is enabled.
31103     bool NoImplicitFloatOps =
31104         DAG.getMachineFunction().getFunction().hasFnAttribute(
31105             Attribute::NoImplicitFloat);
31106     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31107       SDValue Chain;
31108       if (Subtarget.hasSSE1()) {
31109         SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
31110                                        Node->getOperand(2));
31111         MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31112         SclToVec = DAG.getBitcast(StVT, SclToVec);
31113         SDVTList Tys = DAG.getVTList(MVT::Other);
31114         SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
31115         Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
31116                                         MVT::i64, Node->getMemOperand());
31117       } else if (Subtarget.hasX87()) {
31118         // First load this into an 80-bit X87 register using a stack temporary.
31119         // This will put the whole integer into the significand.
31120         SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31121         int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31122         MachinePointerInfo MPI =
31123             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31124         Chain =
31125             DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
31126                          MPI, MaybeAlign(), MachineMemOperand::MOStore);
31127         SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31128         SDValue LdOps[] = {Chain, StackPtr};
31129         SDValue Value =
31130             DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
31131                                     /*Align*/ None, MachineMemOperand::MOLoad);
31132         Chain = Value.getValue(1);
31133 
31134         // Now use an FIST to do the atomic store.
31135         SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
31136         Chain =
31137             DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
31138                                     StoreOps, MVT::i64, Node->getMemOperand());
31139       }
31140 
31141       if (Chain) {
31142         // If this is a sequentially consistent store, also emit an appropriate
31143         // barrier.
31144         if (IsSeqCst)
31145           Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
31146 
31147         return Chain;
31148       }
31149     }
31150   }
31151 
31152   // Convert seq_cst store -> xchg
31153   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
31154   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
31155   SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
31156                                Node->getMemoryVT(),
31157                                Node->getOperand(0),
31158                                Node->getOperand(1), Node->getOperand(2),
31159                                Node->getMemOperand());
31160   return Swap.getValue(1);
31161 }
31162 
31163 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
31164   SDNode *N = Op.getNode();
31165   MVT VT = N->getSimpleValueType(0);
31166   unsigned Opc = Op.getOpcode();
31167 
31168   // Let legalize expand this if it isn't a legal type yet.
31169   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31170     return SDValue();
31171 
31172   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
31173   SDLoc DL(N);
31174 
31175   // Set the carry flag.
31176   SDValue Carry = Op.getOperand(2);
31177   EVT CarryVT = Carry.getValueType();
31178   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
31179                       Carry, DAG.getAllOnesConstant(DL, CarryVT));
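  // (Adding all-ones computes Carry - 1, which sets CF exactly when Carry is
  // nonzero; that carry flag is what the ADC/SBB below consumes.)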
31180 
31181   bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
31182   SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
31183                             Op.getOperand(0), Op.getOperand(1),
31184                             Carry.getValue(1));
31185 
31186   bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
31187   SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
31188                            Sum.getValue(1), DL, DAG);
31189   if (N->getValueType(1) == MVT::i1)
31190     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
31191 
31192   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
31193 }
31194 
31195 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
31196                             SelectionDAG &DAG) {
31197   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
31198 
31199   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
31200   // which returns the values as { float, float } (in XMM0) or
31201   // { double, double } (which is returned in XMM0, XMM1).
31202   SDLoc dl(Op);
31203   SDValue Arg = Op.getOperand(0);
31204   EVT ArgVT = Arg.getValueType();
31205   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
31206 
31207   TargetLowering::ArgListTy Args;
31208   TargetLowering::ArgListEntry Entry;
31209 
31210   Entry.Node = Arg;
31211   Entry.Ty = ArgTy;
31212   Entry.IsSExt = false;
31213   Entry.IsZExt = false;
31214   Args.push_back(Entry);
31215 
31216   bool isF64 = ArgVT == MVT::f64;
31217   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
31218   // the small struct {f32, f32} is returned in (eax, edx). For f64,
31219   // the results are returned via SRet in memory.
31220   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31221   RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
31222   const char *LibcallName = TLI.getLibcallName(LC);
31223   SDValue Callee =
31224       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
31225 
31226   Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
31227                       : (Type *)FixedVectorType::get(ArgTy, 4);
31228 
31229   TargetLowering::CallLoweringInfo CLI(DAG);
31230   CLI.setDebugLoc(dl)
31231       .setChain(DAG.getEntryNode())
31232       .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
31233 
31234   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
31235 
31236   if (isF64)
31237     // Returned in xmm0 and xmm1.
31238     return CallResult.first;
31239 
  // Returned in bits 0:31 and 32:63 of xmm0.
31241   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31242                                CallResult.first, DAG.getIntPtrConstant(0, dl));
31243   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31244                                CallResult.first, DAG.getIntPtrConstant(1, dl));
31245   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
31246   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
31247 }
31248 
31249 /// Widen a vector input to a vector of NVT.  The
31250 /// input vector must have the same element type as NVT.
31251 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
31252                             bool FillWithZeroes = false) {
31253   // Check if InOp already has the right width.
31254   MVT InVT = InOp.getSimpleValueType();
31255   if (InVT == NVT)
31256     return InOp;
31257 
31258   if (InOp.isUndef())
31259     return DAG.getUNDEF(NVT);
31260 
31261   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
31262          "input and widen element type must match");
31263 
31264   unsigned InNumElts = InVT.getVectorNumElements();
31265   unsigned WidenNumElts = NVT.getVectorNumElements();
31266   assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
31267          "Unexpected request for vector widening");
31268 
31269   SDLoc dl(InOp);
31270   if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
31271       InOp.getNumOperands() == 2) {
31272     SDValue N1 = InOp.getOperand(1);
31273     if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
31274         N1.isUndef()) {
31275       InOp = InOp.getOperand(0);
31276       InVT = InOp.getSimpleValueType();
31277       InNumElts = InVT.getVectorNumElements();
31278     }
31279   }
31280   if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
31281       ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
31282     SmallVector<SDValue, 16> Ops;
31283     for (unsigned i = 0; i < InNumElts; ++i)
31284       Ops.push_back(InOp.getOperand(i));
31285 
31286     EVT EltVT = InOp.getOperand(0).getValueType();
31287 
31288     SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
31289       DAG.getUNDEF(EltVT);
31290     for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
31291       Ops.push_back(FillVal);
31292     return DAG.getBuildVector(NVT, dl, Ops);
31293   }
31294   SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
31295     DAG.getUNDEF(NVT);
31296   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
31297                      InOp, DAG.getIntPtrConstant(0, dl));
31298 }
31299 
31300 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
31301                              SelectionDAG &DAG) {
31302   assert(Subtarget.hasAVX512() &&
31303          "MGATHER/MSCATTER are supported on AVX-512 arch only");
31304 
31305   MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
31306   SDValue Src = N->getValue();
31307   MVT VT = Src.getSimpleValueType();
31308   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
31309   SDLoc dl(Op);
31310 
31311   SDValue Scale = N->getScale();
31312   SDValue Index = N->getIndex();
31313   SDValue Mask = N->getMask();
31314   SDValue Chain = N->getChain();
31315   SDValue BasePtr = N->getBasePtr();
31316 
31317   if (VT == MVT::v2f32 || VT == MVT::v2i32) {
31318     assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31319     // If the index is v2i64 and we have VLX we can use xmm for data and index.
31320     if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
31321       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31322       EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
31323       Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
31324       SDVTList VTs = DAG.getVTList(MVT::Other);
31325       SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31326       return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31327                                      N->getMemoryVT(), N->getMemOperand());
31328     }
31329     return SDValue();
31330   }
31331 
31332   MVT IndexVT = Index.getSimpleValueType();
31333 
31334   // If the index is v2i32, we're being called by type legalization and we
31335   // should just let the default handling take care of it.
31336   if (IndexVT == MVT::v2i32)
31337     return SDValue();
31338 
  // If we don't have VLX and neither the data nor the index is 512 bits, we
  // need to widen until one is.
31341   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
31342       !Index.getSimpleValueType().is512BitVector()) {
31343     // Determine how much we need to widen by to get a 512-bit type.
    unsigned Factor = std::min(512 / VT.getSizeInBits(),
                               512 / IndexVT.getSizeInBits());
31346     unsigned NumElts = VT.getVectorNumElements() * Factor;
31347 
31348     VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
31349     IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
31350     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31351 
31352     Src = ExtendToType(Src, VT, DAG);
31353     Index = ExtendToType(Index, IndexVT, DAG);
31354     Mask = ExtendToType(Mask, MaskVT, DAG, true);
31355   }
31356 
31357   SDVTList VTs = DAG.getVTList(MVT::Other);
31358   SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31359   return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31360                                  N->getMemoryVT(), N->getMemOperand());
31361 }
31362 
31363 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
31364                           SelectionDAG &DAG) {
31365 
31366   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
31367   MVT VT = Op.getSimpleValueType();
31368   MVT ScalarVT = VT.getScalarType();
31369   SDValue Mask = N->getMask();
31370   MVT MaskVT = Mask.getSimpleValueType();
31371   SDValue PassThru = N->getPassThru();
31372   SDLoc dl(Op);
31373 
  // Handle AVX masked loads which don't support a passthru other than 0.
31375   if (MaskVT.getVectorElementType() != MVT::i1) {
31376     // We also allow undef in the isel pattern.
31377     if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
31378       return Op;
31379 
31380     SDValue NewLoad = DAG.getMaskedLoad(
31381         VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31382         getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
31383         N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
31384         N->isExpandingLoad());
31385     // Emit a blend.
31386     SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
31387     return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
31388   }
31389 
31390   assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
31391          "Expanding masked load is supported on AVX-512 target only!");
31392 
31393   assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
31394          "Expanding masked load is supported for 32 and 64-bit types only!");
31395 
31396   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31397          "Cannot lower masked load op.");
31398 
31399   assert((ScalarVT.getSizeInBits() >= 32 ||
31400           (Subtarget.hasBWI() &&
31401               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31402          "Unsupported masked load op.");
31403 
  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bits.
31406   unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
31407   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31408   PassThru = ExtendToType(PassThru, WideDataVT, DAG);
31409 
31410   // Mask element has to be i1.
31411   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31412          "Unexpected mask type");
31413 
31414   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31415 
31416   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31417   SDValue NewLoad = DAG.getMaskedLoad(
31418       WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31419       PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
31420       N->getExtensionType(), N->isExpandingLoad());
31421 
31422   SDValue Extract =
31423       DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
31424                   DAG.getIntPtrConstant(0, dl));
31425   SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
31426   return DAG.getMergeValues(RetOps, dl);
31427 }
31428 
31429 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
31430                            SelectionDAG &DAG) {
31431   MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
31432   SDValue DataToStore = N->getValue();
31433   MVT VT = DataToStore.getSimpleValueType();
31434   MVT ScalarVT = VT.getScalarType();
31435   SDValue Mask = N->getMask();
31436   SDLoc dl(Op);
31437 
  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
         "Compressing masked store is supported on AVX-512 targets only!");

  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
         "Compressing masked store is supported for 32 and 64-bit types only!");
31443 
31444   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31445          "Cannot lower masked store op.");
31446 
31447   assert((ScalarVT.getSizeInBits() >= 32 ||
31448           (Subtarget.hasBWI() &&
31449               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31450           "Unsupported masked store op.");
31451 
  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bits.
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
31455   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31456 
31457   // Mask element has to be i1.
31458   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31459          "Unexpected mask type");
31460 
31461   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31462 
31463   DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
31464   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31465   return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
31466                             N->getOffset(), Mask, N->getMemoryVT(),
31467                             N->getMemOperand(), N->getAddressingMode(),
31468                             N->isTruncatingStore(), N->isCompressingStore());
31469 }
31470 
31471 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
31472                             SelectionDAG &DAG) {
31473   assert(Subtarget.hasAVX2() &&
31474          "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
31475 
31476   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
31477   SDLoc dl(Op);
31478   MVT VT = Op.getSimpleValueType();
31479   SDValue Index = N->getIndex();
31480   SDValue Mask = N->getMask();
31481   SDValue PassThru = N->getPassThru();
31482   MVT IndexVT = Index.getSimpleValueType();
31483 
31484   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
31485 
31486   // If the index is v2i32, we're being called by type legalization.
31487   if (IndexVT == MVT::v2i32)
31488     return SDValue();
31489 
  // If we don't have VLX and neither the passthru nor the index is 512 bits,
  // we need to widen until one is.
31492   MVT OrigVT = VT;
31493   if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31494       !IndexVT.is512BitVector()) {
31495     // Determine how much we need to widen by to get a 512-bit type.
    unsigned Factor = std::min(512 / VT.getSizeInBits(),
                               512 / IndexVT.getSizeInBits());
31498 
31499     unsigned NumElts = VT.getVectorNumElements() * Factor;
31500 
31501     VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
31502     IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
31503     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31504 
31505     PassThru = ExtendToType(PassThru, VT, DAG);
31506     Index = ExtendToType(Index, IndexVT, DAG);
31507     Mask = ExtendToType(Mask, MaskVT, DAG, true);
31508   }
31509 
31510   // Break dependency on the data register.
31511   if (PassThru.isUndef())
31512     PassThru = getZeroVector(VT, Subtarget, DAG, dl);
31513 
31514   SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
31515                     N->getScale() };
31516   SDValue NewGather = DAG.getMemIntrinsicNode(
31517       X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
31518       N->getMemOperand());
31519   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
31520                                 NewGather, DAG.getIntPtrConstant(0, dl));
31521   return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
31522 }
31523 
31524 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
31525   SDLoc dl(Op);
31526   SDValue Src = Op.getOperand(0);
31527   MVT DstVT = Op.getSimpleValueType();
31528 
31529   AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
31530   unsigned SrcAS = N->getSrcAddressSpace();
31531 
31532   assert(SrcAS != N->getDestAddressSpace() &&
31533          "addrspacecast must be between different address spaces");
31534 
31535   if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
31536     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
31537   } else if (DstVT == MVT::i64) {
31538     Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
31539   } else if (DstVT == MVT::i32) {
31540     Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
31541   } else {
31542     report_fatal_error("Bad address space in addrspacecast");
31543   }
31544   return Op;
31545 }
31546 
31547 SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
31548                                               SelectionDAG &DAG) const {
31549   // TODO: Eventually, the lowering of these nodes should be informed by or
31550   // deferred to the GC strategy for the function in which they appear. For
31551   // now, however, they must be lowered to something. Since they are logically
31552   // no-ops in the case of a null GC strategy (or a GC strategy which does not
31553   // require special handling for these nodes), lower them as literal NOOPs for
31554   // the time being.
31555   SmallVector<SDValue, 2> Ops;
31556 
31557   Ops.push_back(Op.getOperand(0));
31558   if (Op->getGluedNode())
31559     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
31560 
31561   SDLoc OpDL(Op);
31562   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
31563   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
31564 
31565   return NOOP;
31566 }
31567 
31568 // Custom split CVTPS2PH with wide types.
31569 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
31570   SDLoc dl(Op);
31571   EVT VT = Op.getValueType();
31572   SDValue Lo, Hi;
31573   std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
31574   EVT LoVT, HiVT;
31575   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31576   SDValue RC = Op.getOperand(1);
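  // Both halves reuse the same rounding-control operand.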
31577   Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
31578   Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
31579   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31580 }
31581 
31582 /// Provide custom lowering hooks for some operations.
31583 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
31584   switch (Op.getOpcode()) {
31585   default: llvm_unreachable("Should not custom lower this!");
31586   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
31587   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
31588     return LowerCMP_SWAP(Op, Subtarget, DAG);
31589   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
31590   case ISD::ATOMIC_LOAD_ADD:
31591   case ISD::ATOMIC_LOAD_SUB:
31592   case ISD::ATOMIC_LOAD_OR:
31593   case ISD::ATOMIC_LOAD_XOR:
31594   case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
31595   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG, Subtarget);
31596   case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
31597   case ISD::PARITY:             return LowerPARITY(Op, Subtarget, DAG);
31598   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
31599   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
31600   case ISD::VECTOR_SHUFFLE:     return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
31601   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
31602   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
31603   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
31604   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
31605   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
31606   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
31607   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
31608   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
31609   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
31610   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
31611   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
31612   case ISD::SHL_PARTS:
31613   case ISD::SRA_PARTS:
31614   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
31615   case ISD::FSHL:
31616   case ISD::FSHR:               return LowerFunnelShift(Op, Subtarget, DAG);
31617   case ISD::STRICT_SINT_TO_FP:
31618   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
31619   case ISD::STRICT_UINT_TO_FP:
31620   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
31621   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
31622   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
31623   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
31624   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
31625   case ISD::ZERO_EXTEND_VECTOR_INREG:
31626   case ISD::SIGN_EXTEND_VECTOR_INREG:
31627     return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
31628   case ISD::FP_TO_SINT:
31629   case ISD::STRICT_FP_TO_SINT:
31630   case ISD::FP_TO_UINT:
31631   case ISD::STRICT_FP_TO_UINT:  return LowerFP_TO_INT(Op, DAG);
31632   case ISD::FP_TO_SINT_SAT:
31633   case ISD::FP_TO_UINT_SAT:     return LowerFP_TO_INT_SAT(Op, DAG);
31634   case ISD::FP_EXTEND:
31635   case ISD::STRICT_FP_EXTEND:   return LowerFP_EXTEND(Op, DAG);
31636   case ISD::FP_ROUND:
31637   case ISD::STRICT_FP_ROUND:    return LowerFP_ROUND(Op, DAG);
31638   case ISD::FP16_TO_FP:
31639   case ISD::STRICT_FP16_TO_FP:  return LowerFP16_TO_FP(Op, DAG);
31640   case ISD::FP_TO_FP16:
31641   case ISD::STRICT_FP_TO_FP16:  return LowerFP_TO_FP16(Op, DAG);
31642   case ISD::LOAD:               return LowerLoad(Op, Subtarget, DAG);
31643   case ISD::STORE:              return LowerStore(Op, Subtarget, DAG);
31644   case ISD::FADD:
31645   case ISD::FSUB:               return lowerFaddFsub(Op, DAG);
31646   case ISD::FROUND:             return LowerFROUND(Op, DAG);
31647   case ISD::FABS:
31648   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
31649   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
31650   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
31651   case ISD::LRINT:
31652   case ISD::LLRINT:             return LowerLRINT_LLRINT(Op, DAG);
31653   case ISD::SETCC:
31654   case ISD::STRICT_FSETCC:
31655   case ISD::STRICT_FSETCCS:     return LowerSETCC(Op, DAG);
31656   case ISD::SETCCCARRY:         return LowerSETCCCARRY(Op, DAG);
31657   case ISD::SELECT:             return LowerSELECT(Op, DAG);
31658   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
31659   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
31660   case ISD::VASTART:            return LowerVASTART(Op, DAG);
31661   case ISD::VAARG:              return LowerVAARG(Op, DAG);
31662   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
31663   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
31664   case ISD::INTRINSIC_VOID:
31665   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
31666   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
31667   case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);
31668   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
31669   case ISD::FRAME_TO_ARGS_OFFSET:
31670                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
31671   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
31672   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
31673   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
31674   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
31675   case ISD::EH_SJLJ_SETUP_DISPATCH:
31676     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
31677   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
31678   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
31679   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
31680   case ISD::SET_ROUNDING:       return LowerSET_ROUNDING(Op, DAG);
31681   case ISD::CTLZ:
31682   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
31683   case ISD::CTTZ:
31684   case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, Subtarget, DAG);
31685   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
31686   case ISD::MULHS:
31687   case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
31688   case ISD::ROTL:
31689   case ISD::ROTR:               return LowerRotate(Op, Subtarget, DAG);
31690   case ISD::SRA:
31691   case ISD::SRL:
31692   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
31693   case ISD::SADDO:
31694   case ISD::UADDO:
31695   case ISD::SSUBO:
31696   case ISD::USUBO:              return LowerXALUO(Op, DAG);
31697   case ISD::SMULO:
31698   case ISD::UMULO:              return LowerMULO(Op, Subtarget, DAG);
31699   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
31700   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
31701   case ISD::SADDO_CARRY:
31702   case ISD::SSUBO_CARRY:
31703   case ISD::ADDCARRY:
31704   case ISD::SUBCARRY:           return LowerADDSUBCARRY(Op, DAG);
31705   case ISD::ADD:
31706   case ISD::SUB:                return lowerAddSub(Op, DAG, Subtarget);
31707   case ISD::UADDSAT:
31708   case ISD::SADDSAT:
31709   case ISD::USUBSAT:
31710   case ISD::SSUBSAT:            return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
31711   case ISD::SMAX:
31712   case ISD::SMIN:
31713   case ISD::UMAX:
31714   case ISD::UMIN:               return LowerMINMAX(Op, DAG);
31715   case ISD::ABS:                return LowerABS(Op, Subtarget, DAG);
31716   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
31717   case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
31718   case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
31719   case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
31720   case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
31721   case ISD::GC_TRANSITION_START:
31722   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION(Op, DAG);
31723   case ISD::ADDRSPACECAST:      return LowerADDRSPACECAST(Op, DAG);
31724   case X86ISD::CVTPS2PH:        return LowerCVTPS2PH(Op, DAG);
31725   }
31726 }
31727 
31728 /// Replace a node with an illegal result type with a new node built out of
31729 /// custom code.
31730 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
31731                                            SmallVectorImpl<SDValue>&Results,
31732                                            SelectionDAG &DAG) const {
31733   SDLoc dl(N);
31734   switch (N->getOpcode()) {
31735   default:
31736 #ifndef NDEBUG
31737     dbgs() << "ReplaceNodeResults: ";
31738     N->dump(&DAG);
31739 #endif
31740     llvm_unreachable("Do not know how to custom type legalize this operation!");
31741   case X86ISD::CVTPH2PS: {
31742     EVT VT = N->getValueType(0);
31743     SDValue Lo, Hi;
31744     std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
31745     EVT LoVT, HiVT;
31746     std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31747     Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
31748     Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
31749     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31750     Results.push_back(Res);
31751     return;
31752   }
31753   case X86ISD::STRICT_CVTPH2PS: {
31754     EVT VT = N->getValueType(0);
31755     SDValue Lo, Hi;
31756     std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
31757     EVT LoVT, HiVT;
31758     std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31759     Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
31760                      {N->getOperand(0), Lo});
31761     Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
31762                      {N->getOperand(0), Hi});
31763     SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31764                                 Lo.getValue(1), Hi.getValue(1));
31765     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31766     Results.push_back(Res);
31767     Results.push_back(Chain);
31768     return;
31769   }
31770   case X86ISD::CVTPS2PH:
31771     Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
31772     return;
31773   case ISD::CTPOP: {
31774     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31775     // Use a v2i64 if possible.
31776     bool NoImplicitFloatOps =
31777         DAG.getMachineFunction().getFunction().hasFnAttribute(
31778             Attribute::NoImplicitFloat);
31779     if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
31780       SDValue Wide =
31781           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
31782       Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
      // The bit count fits in 32 bits; extract it as i32 and then zero extend
      // to i64. Otherwise we end up extracting bits 63:32 separately.
31785       Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
31786       Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
31787                          DAG.getIntPtrConstant(0, dl));
31788       Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
31789       Results.push_back(Wide);
31790     }
31791     return;
31792   }
31793   case ISD::MUL: {
31794     EVT VT = N->getValueType(0);
31795     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31796            VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
31797     // Pre-promote these to vXi16 to avoid op legalization thinking all 16
31798     // elements are needed.
31799     MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
31800     SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
31801     SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
31802     SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
31803     Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
31804     unsigned NumConcats = 16 / VT.getVectorNumElements();
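    // e.g. a v2i8 result needs NumConcats == 8 to reach a legal v16i8.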
31805     SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
31806     ConcatOps[0] = Res;
31807     Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
31808     Results.push_back(Res);
31809     return;
31810   }
31811   case X86ISD::VPMADDWD:
31812   case X86ISD::AVG: {
31813     // Legalize types for X86ISD::AVG/VPMADDWD by widening.
31814     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31815 
31816     EVT VT = N->getValueType(0);
31817     EVT InVT = N->getOperand(0).getValueType();
31818     assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
31819            "Expected a VT that divides into 128 bits.");
31820     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31821            "Unexpected type action!");
31822     unsigned NumConcat = 128 / InVT.getSizeInBits();
31823 
31824     EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
31825                                     InVT.getVectorElementType(),
31826                                     NumConcat * InVT.getVectorNumElements());
31827     EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
31828                                   VT.getVectorElementType(),
31829                                   NumConcat * VT.getVectorNumElements());
31830 
31831     SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
31832     Ops[0] = N->getOperand(0);
31833     SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
31834     Ops[0] = N->getOperand(1);
31835     SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
31836 
31837     SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
31838     Results.push_back(Res);
31839     return;
31840   }
31841   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
31842   case X86ISD::FMINC:
31843   case X86ISD::FMIN:
31844   case X86ISD::FMAXC:
31845   case X86ISD::FMAX: {
31846     EVT VT = N->getValueType(0);
31847     assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
31848     SDValue UNDEF = DAG.getUNDEF(VT);
31849     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
31850                               N->getOperand(0), UNDEF);
31851     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
31852                               N->getOperand(1), UNDEF);
31853     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
31854     return;
31855   }
31856   case ISD::SDIV:
31857   case ISD::UDIV:
31858   case ISD::SREM:
31859   case ISD::UREM: {
31860     EVT VT = N->getValueType(0);
31861     if (VT.isVector()) {
31862       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31863              "Unexpected type action!");
      // If the RHS is a constant splat vector we can widen this and let the
      // division/remainder-by-constant optimization handle it.
31866       // TODO: Can we do something for non-splat?
31867       APInt SplatVal;
31868       if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
31869         unsigned NumConcats = 128 / VT.getSizeInBits();
31870         SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
31871         Ops0[0] = N->getOperand(0);
31872         EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
31873         SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
31874         SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
31875         SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
31876         Results.push_back(Res);
31877       }
31878       return;
31879     }
31880 
31881     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
31882     Results.push_back(V);
31883     return;
31884   }
31885   case ISD::TRUNCATE: {
31886     MVT VT = N->getSimpleValueType(0);
31887     if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
31888       return;
31889 
    // The generic legalizer will try to widen the input type to the same
    // number of elements as the widened result type. But this isn't always
    // the best choice, so do some custom legalization to avoid certain cases.
31893     MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
31894     SDValue In = N->getOperand(0);
31895     EVT InVT = In.getValueType();
31896 
31897     unsigned InBits = InVT.getSizeInBits();
31898     if (128 % InBits == 0) {
      // Inputs of 128 bits or smaller should avoid the truncate altogether
      // and just use a build_vector that will become a shuffle.
31901       // TODO: Widen and use a shuffle directly?
31902       MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
31903       EVT EltVT = VT.getVectorElementType();
31904       unsigned WidenNumElts = WidenVT.getVectorNumElements();
31905       SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
31906       // Use the original element count so we don't do more scalar opts than
31907       // necessary.
31908       unsigned MinElts = VT.getVectorNumElements();
31909       for (unsigned i=0; i < MinElts; ++i) {
31910         SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
31911                                   DAG.getIntPtrConstant(i, dl));
31912         Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
31913       }
31914       Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
31915       return;
31916     }
31917     // With AVX512 there are some cases that can use a target specific
31918     // truncate node to go from 256/512 to less than 128 with zeros in the
31919     // upper elements of the 128 bit result.
31920     if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
      // We can use VTRUNC directly for 256-bit inputs with VLX or for any 512.
31922       if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
31923         Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
31924         return;
31925       }
31926       // There's one case we can widen to 512 bits and use VTRUNC.
31927       if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
31928         In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
31929                          DAG.getUNDEF(MVT::v4i64));
31930         Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
31931         return;
31932       }
31933     }
31934     if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
31935         getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
31936         isTypeLegal(MVT::v4i64)) {
      // The input needs to be split and the output needs to be widened. Use
      // two VTRUNCs, and shuffle their results together into the wider type.
31939       SDValue Lo, Hi;
31940       std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
31941 
31942       Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
31943       Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
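      // Each VTRUNC result holds its 4 valid bytes in elements 0-3; gather
      // those from both halves into the low 8 bytes of the result.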
31944       SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
31945                                          { 0,  1,  2,  3, 16, 17, 18, 19,
31946                                           -1, -1, -1, -1, -1, -1, -1, -1 });
31947       Results.push_back(Res);
31948       return;
31949     }
31950 
31951     return;
31952   }
31953   case ISD::ANY_EXTEND:
31954     // Right now, only MVT::v8i8 has Custom action for an illegal type.
31955     // It's intended to custom handle the input type.
31956     assert(N->getValueType(0) == MVT::v8i8 &&
31957            "Do not know how to legalize this Node");
31958     return;
31959   case ISD::SIGN_EXTEND:
31960   case ISD::ZERO_EXTEND: {
31961     EVT VT = N->getValueType(0);
31962     SDValue In = N->getOperand(0);
31963     EVT InVT = In.getValueType();
31964     if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
31965         (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
31966       assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
31967              "Unexpected type action!");
31968       assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
      // Custom split this so we can extend i8/i16->i32 in-vector. This is
      // better since sign_extend_inreg i8/i16->i64 requires an extend to i32
      // using sra, and then an extend from i32 to i64 using pcmpgt. By custom
      // splitting we allow the sra from the extend to i32 to be shared by the
      // split.
31973       In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
31974 
31975       // Fill a vector with sign bits for each element.
31976       SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
31977       SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
31978 
31979       // Create an unpackl and unpackh to interleave the sign bits then bitcast
31980       // to v2i64.
31981       SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
31982                                         {0, 4, 1, 5});
31983       Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
31984       SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
31985                                         {2, 6, 3, 7});
31986       Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
31987 
31988       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31989       Results.push_back(Res);
31990       return;
31991     }
31992 
31993     if (VT == MVT::v16i32 || VT == MVT::v8i64) {
31994       if (!InVT.is128BitVector()) {
31995         // Not a 128 bit vector, but maybe type legalization will promote
31996         // it to 128 bits.
31997         if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
31998           return;
31999         InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
32000         if (!InVT.is128BitVector())
32001           return;
32002 
32003         // Promote the input to 128 bits. Type legalization will turn this into
32004         // zext_inreg/sext_inreg.
32005         In = DAG.getNode(N->getOpcode(), dl, InVT, In);
32006       }
32007 
32008       // Perform custom splitting instead of the two stage extend we would get
32009       // by default.
32010       EVT LoVT, HiVT;
32011       std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
32012       assert(isTypeLegal(LoVT) && "Split VT not legal?");
32013 
32014       SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
32015 
32016       // We need to shift the input over by half the number of elements.
32017       unsigned NumElts = InVT.getVectorNumElements();
32018       unsigned HalfNumElts = NumElts / 2;
32019       SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
32020       for (unsigned i = 0; i != HalfNumElts; ++i)
32021         ShufMask[i] = i + HalfNumElts;
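      // e.g. for a v16i8 input this picks elements 8..15 into the low half
      // and leaves the rest undef.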
32022 
32023       SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
32024       Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
32025 
32026       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32027       Results.push_back(Res);
32028     }
32029     return;
32030   }
32031   case ISD::FP_TO_SINT:
32032   case ISD::STRICT_FP_TO_SINT:
32033   case ISD::FP_TO_UINT:
32034   case ISD::STRICT_FP_TO_UINT: {
32035     bool IsStrict = N->isStrictFPOpcode();
32036     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
32037                     N->getOpcode() == ISD::STRICT_FP_TO_SINT;
32038     EVT VT = N->getValueType(0);
32039     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32040     EVT SrcVT = Src.getValueType();
32041 
32042     if (VT.isVector() && Subtarget.hasFP16() &&
32043         SrcVT.getVectorElementType() == MVT::f16) {
32044       EVT EleVT = VT.getVectorElementType();
32045       EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
32046 
32047       if (SrcVT != MVT::v8f16) {
32048         SDValue Tmp =
32049             IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
32050         SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
32051         Ops[0] = Src;
32052         Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
32053       }
32054 
32055       SDValue Res, Chain;
32056       if (IsStrict) {
32057         unsigned Opc =
32058             IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32059         Res =
32060             DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
32061         Chain = Res.getValue(1);
32062       } else {
32063         unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32064         Res = DAG.getNode(Opc, dl, ResVT, Src);
32065       }
32066 
32067       // TODO: Need to add exception check code for strict FP.
32068       if (EleVT.getSizeInBits() < 16) {
32069         MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
32070         Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
32071 
32072         // Now widen to 128 bits.
32073         unsigned NumConcats = 128 / TmpVT.getSizeInBits();
32074         MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
32075         SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
32076         ConcatOps[0] = Res;
32077         Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32078       }
32079 
32080       Results.push_back(Res);
32081       if (IsStrict)
32082         Results.push_back(Chain);
32083 
32084       return;
32085     }
32086 
32087     if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
32088       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32089              "Unexpected type action!");
32090 
32091       // Try to create a 128 bit vector, but don't exceed a 32 bit element.
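      // e.g. v8i8 promotes to v8i16, while v2i8/v4i8/v2i16/v4i16 promote to
      // 32-bit elements.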
32092       unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
32093       MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
32094                                        VT.getVectorNumElements());
32095       SDValue Res;
32096       SDValue Chain;
32097       if (IsStrict) {
32098         Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
32099                           {N->getOperand(0), Src});
32100         Chain = Res.getValue(1);
32101       } else
32102         Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
32103 
      // Preserve what we know about the size of the original result. If the
      // result is v2i32, we have to manually widen before emitting the assert.
32106       if (PromoteVT == MVT::v2i32)
32107         Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32108                           DAG.getUNDEF(MVT::v2i32));
32109 
32110       Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
32111                         Res.getValueType(), Res,
32112                         DAG.getValueType(VT.getVectorElementType()));
32113 
32114       if (PromoteVT == MVT::v2i32)
32115         Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
32116                           DAG.getIntPtrConstant(0, dl));
32117 
32118       // Truncate back to the original width.
32119       Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32120 
32121       // Now widen to 128 bits.
32122       unsigned NumConcats = 128 / VT.getSizeInBits();
32123       MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
32124                                       VT.getVectorNumElements() * NumConcats);
32125       SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32126       ConcatOps[0] = Res;
32127       Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32128       Results.push_back(Res);
32129       if (IsStrict)
32130         Results.push_back(Chain);
32131       return;
32132     }
32133 
32134 
32135     if (VT == MVT::v2i32) {
32136       assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
32137              "Strict unsigned conversion requires AVX512");
32138       assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32139       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32140              "Unexpected type action!");
32141       if (Src.getValueType() == MVT::v2f64) {
32142         if (!IsSigned && !Subtarget.hasAVX512()) {
32143           SDValue Res =
32144               expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
32145           Results.push_back(Res);
32146           return;
32147         }
32148 
32149         unsigned Opc;
32150         if (IsStrict)
32151           Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32152         else
32153           Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32154 
        // If we have VLX we can emit a target specific FP_TO_UINT node.
32156         if (!IsSigned && !Subtarget.hasVLX()) {
32157           // Otherwise we can defer to the generic legalizer which will widen
32158           // the input as well. This will be further widened during op
32159           // legalization to v8i32<-v8f64.
32160           // For strict nodes we'll need to widen ourselves.
32161           // FIXME: Fix the type legalizer to safely widen strict nodes?
32162           if (!IsStrict)
32163             return;
32164           Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
32165                             DAG.getConstantFP(0.0, dl, MVT::v2f64));
32166           Opc = N->getOpcode();
32167         }
32168         SDValue Res;
32169         SDValue Chain;
32170         if (IsStrict) {
32171           Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
32172                             {N->getOperand(0), Src});
32173           Chain = Res.getValue(1);
32174         } else {
32175           Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
32176         }
32177         Results.push_back(Res);
32178         if (IsStrict)
32179           Results.push_back(Chain);
32180         return;
32181       }
32182 
32183       // Custom widen strict v2f32->v2i32 by padding with zeros.
32184       // FIXME: Should generic type legalizer do this?
32185       if (Src.getValueType() == MVT::v2f32 && IsStrict) {
32186         Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
32187                           DAG.getConstantFP(0.0, dl, MVT::v2f32));
32188         SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
32189                                   {N->getOperand(0), Src});
32190         Results.push_back(Res);
32191         Results.push_back(Res.getValue(1));
32192         return;
32193       }
32194 
32195       // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
32196       // so early out here.
32197       return;
32198     }
32199 
32200     assert(!VT.isVector() && "Vectors should have been handled above!");
32201 
32202     if ((Subtarget.hasDQI() && VT == MVT::i64 &&
32203          (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
32204         (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
32205       assert(!Subtarget.is64Bit() && "i64 should be legal");
32206       unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
32207       // If we use a 128-bit result we might need to use a target specific node.
32208       unsigned SrcElts =
32209           std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
32210       MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
32211       MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
32212       unsigned Opc = N->getOpcode();
32213       if (NumElts != SrcElts) {
32214         if (IsStrict)
32215           Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32216         else
32217           Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32218       }
32219 
32220       SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
32221       SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
32222                                 DAG.getConstantFP(0.0, dl, VecInVT), Src,
32223                                 ZeroIdx);
32224       SDValue Chain;
32225       if (IsStrict) {
32226         SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
32227         Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
32228         Chain = Res.getValue(1);
32229       } else
32230         Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
32231       Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
32232       Results.push_back(Res);
32233       if (IsStrict)
32234         Results.push_back(Chain);
32235       return;
32236     }
32237 
32238     if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
32239       SDValue Chain;
32240       SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
32241       Results.push_back(V);
32242       if (IsStrict)
32243         Results.push_back(Chain);
32244       return;
32245     }
32246 
32247     SDValue Chain;
32248     if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
32249       Results.push_back(V);
32250       if (IsStrict)
32251         Results.push_back(Chain);
32252     }
32253     return;
32254   }
32255   case ISD::LRINT:
32256   case ISD::LLRINT: {
32257     if (SDValue V = LRINT_LLRINTHelper(N, DAG))
32258       Results.push_back(V);
32259     return;
32260   }
32261 
32262   case ISD::SINT_TO_FP:
32263   case ISD::STRICT_SINT_TO_FP:
32264   case ISD::UINT_TO_FP:
32265   case ISD::STRICT_UINT_TO_FP: {
32266     bool IsStrict = N->isStrictFPOpcode();
32267     bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
32268                     N->getOpcode() == ISD::STRICT_SINT_TO_FP;
32269     EVT VT = N->getValueType(0);
32270     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32271     if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
32272         Subtarget.hasVLX()) {
32273       if (Src.getValueType().getVectorElementType() == MVT::i16)
32274         return;
32275 
32276       if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
32277         Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
32278                           IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
32279                                    : DAG.getUNDEF(MVT::v2i32));
32280       if (IsStrict) {
32281         unsigned Opc =
32282             IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
32283         SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
32284                                   {N->getOperand(0), Src});
32285         Results.push_back(Res);
32286         Results.push_back(Res.getValue(1));
32287       } else {
32288         unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
32289         Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
32290       }
32291       return;
32292     }
32293     if (VT != MVT::v2f32)
32294       return;
32295     EVT SrcVT = Src.getValueType();
32296     if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
32297       if (IsStrict) {
32298         unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
32299                                 : X86ISD::STRICT_CVTUI2P;
32300         SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
32301                                   {N->getOperand(0), Src});
32302         Results.push_back(Res);
32303         Results.push_back(Res.getValue(1));
32304       } else {
32305         unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
32306         Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
32307       }
32308       return;
32309     }
32310     if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
32311         Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
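      // v2i64 unsigned-to-f32 without AVX512: for elements with the sign bit
      // set, halve the value (OR-ing in the low bit to preserve rounding),
      // convert as signed, and double with an FADD; then select per element
      // between that and the direct signed conversion.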
32312       SDValue Zero = DAG.getConstant(0, dl, SrcVT);
32313       SDValue One  = DAG.getConstant(1, dl, SrcVT);
32314       SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
32315                                  DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
32316                                  DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
32317       SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
32318       SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
32319       SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
32320       for (int i = 0; i != 2; ++i) {
32321         SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
32322                                   SignSrc, DAG.getIntPtrConstant(i, dl));
32323         if (IsStrict)
32324           SignCvts[i] =
32325               DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
32326                           {N->getOperand(0), Elt});
32327         else
32328           SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
32329       };
32330       SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
32331       SDValue Slow, Chain;
32332       if (IsStrict) {
32333         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32334                             SignCvts[0].getValue(1), SignCvts[1].getValue(1));
32335         Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
32336                            {Chain, SignCvt, SignCvt});
32337         Chain = Slow.getValue(1);
32338       } else {
32339         Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
32340       }
32341       IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
32342       IsNeg =
32343           DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
32344       SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
32345       Results.push_back(Cvt);
32346       if (IsStrict)
32347         Results.push_back(Chain);
32348       return;
32349     }
32350 
32351     if (SrcVT != MVT::v2i32)
32352       return;
32353 
32354     if (IsSigned || Subtarget.hasAVX512()) {
32355       if (!IsStrict)
32356         return;
32357 
32358       // Custom widen strict v2i32->v2f32 to avoid scalarization.
32359       // FIXME: Should generic type legalizer do this?
32360       Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
32361                         DAG.getConstant(0, dl, MVT::v2i32));
32362       SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
32363                                 {N->getOperand(0), Src});
32364       Results.push_back(Res);
32365       Results.push_back(Res.getValue(1));
32366       return;
32367     }
32368 
32369     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
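    // Lower unsigned v2i32->fp with the classic bias trick: place each 32-bit
    // value in the low mantissa bits of 2^52 (0x4330000000000000), subtract
    // the bias to get an exact f64, then round the result to f32 (VFPROUND).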
32370     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
32371     SDValue VBias =
32372         DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
32373     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
32374                              DAG.getBitcast(MVT::v2i64, VBias));
32375     Or = DAG.getBitcast(MVT::v2f64, Or);
32376     if (IsStrict) {
32377       SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
32378                                 {N->getOperand(0), Or, VBias});
32379       SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
32380                                 {MVT::v4f32, MVT::Other},
32381                                 {Sub.getValue(1), Sub});
32382       Results.push_back(Res);
32383       Results.push_back(Res.getValue(1));
32384     } else {
32385       // TODO: Are there any fast-math-flags to propagate here?
32386       SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
32387       Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
32388     }
32389     return;
32390   }
32391   case ISD::STRICT_FP_ROUND:
32392   case ISD::FP_ROUND: {
32393     bool IsStrict = N->isStrictFPOpcode();
32394     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32395     EVT VT = N->getValueType(0);
32396     EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
32397     if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
32398       SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
32399                              : DAG.getUNDEF(MVT::v2f32);
32400       Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
32401     }
32402     if (!isTypeLegal(Src.getValueType()))
32403       return;
32404     SDValue V;
32405     if (IsStrict)
32406       V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
32407                       {N->getOperand(0), Src});
32408     else
32409       V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
32410     Results.push_back(V);
32411     if (IsStrict)
32412       Results.push_back(V.getValue(1));
32413     return;
32414   }
32415   case ISD::FP_EXTEND:
32416   case ISD::STRICT_FP_EXTEND: {
32417     // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
32418     // No other ValueType for FP_EXTEND should reach this point.
32419     assert(N->getValueType(0) == MVT::v2f32 &&
32420            "Do not know how to legalize this Node");
32421     if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
32422       return;
32423     bool IsStrict = N->isStrictFPOpcode();
32424     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32425     SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
32426                            : DAG.getUNDEF(MVT::v2f16);
32427     SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
32428     if (IsStrict)
32429       V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
32430                       {N->getOperand(0), V});
32431     else
32432       V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
32433     Results.push_back(V);
32434     if (IsStrict)
32435       Results.push_back(V.getValue(1));
32436     return;
32437   }
32438   case ISD::INTRINSIC_W_CHAIN: {
32439     unsigned IntNo = N->getConstantOperandVal(1);
32440     switch (IntNo) {
32441     default : llvm_unreachable("Do not know how to custom type "
32442                                "legalize this intrinsic operation!");
32443     case Intrinsic::x86_rdtsc:
32444       return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
32445                                      Results);
32446     case Intrinsic::x86_rdtscp:
32447       return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
32448                                      Results);
32449     case Intrinsic::x86_rdpmc:
32450       expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
32451                                   Results);
32452       return;
32453     case Intrinsic::x86_xgetbv:
32454       expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
32455                                   Results);
32456       return;
32457     }
32458   }
32459   case ISD::READCYCLECOUNTER: {
32460     return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
32461   }
32462   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
32463     EVT T = N->getValueType(0);
32464     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
32465     bool Regs64bit = T == MVT::i128;
32466     assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
32467            "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
32468     MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
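    // CMPXCHG8B/16B takes the compare value in EDX:EAX (RDX:RAX for the
    // 16-byte form) and the replacement in ECX:EBX (RCX:RBX); the original
    // memory value comes back in EDX:EAX and ZF indicates success.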
32469     SDValue cpInL, cpInH;
32470     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
32471                         DAG.getConstant(0, dl, HalfT));
32472     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
32473                         DAG.getConstant(1, dl, HalfT));
32474     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
32475                              Regs64bit ? X86::RAX : X86::EAX,
32476                              cpInL, SDValue());
32477     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
32478                              Regs64bit ? X86::RDX : X86::EDX,
32479                              cpInH, cpInL.getValue(1));
32480     SDValue swapInL, swapInH;
32481     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
32482                           DAG.getConstant(0, dl, HalfT));
32483     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
32484                           DAG.getConstant(1, dl, HalfT));
32485     swapInH =
32486         DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
32487                          swapInH, cpInH.getValue(1));
32488 
    // In 64-bit mode we might need the base pointer in RBX, but we can't know
    // until later. So we keep the RBX input in a vreg and use a custom
    // inserter.
    // Since RBX will be a reserved register, the register allocator will not
    // make sure its value is properly saved and restored around this
    // live-range.
32495     SDValue Result;
32496     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32497     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
32498     if (Regs64bit) {
32499       SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
32500                        swapInH.getValue(1)};
32501       Result =
32502           DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
32503     } else {
32504       swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
32505                                  swapInH.getValue(1));
32506       SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
32507                        swapInL.getValue(1)};
32508       Result =
32509           DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
32510     }
32511 
32512     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
32513                                         Regs64bit ? X86::RAX : X86::EAX,
32514                                         HalfT, Result.getValue(1));
32515     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
32516                                         Regs64bit ? X86::RDX : X86::EDX,
32517                                         HalfT, cpOutL.getValue(2));
32518     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
32519 
32520     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
32521                                         MVT::i32, cpOutH.getValue(2));
32522     SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
32523     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
32524 
32525     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
32526     Results.push_back(Success);
32527     Results.push_back(EFLAGS.getValue(1));
32528     return;
32529   }
32530   case ISD::ATOMIC_LOAD: {
32531     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32532     bool NoImplicitFloatOps =
32533         DAG.getMachineFunction().getFunction().hasFnAttribute(
32534             Attribute::NoImplicitFloat);
32535     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
32536       auto *Node = cast<AtomicSDNode>(N);
32537       if (Subtarget.hasSSE1()) {
32538         // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
        // Then extract the lower 64 bits.
32540         MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
32541         SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
32542         SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
32543         SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
32544                                              MVT::i64, Node->getMemOperand());
32545         if (Subtarget.hasSSE2()) {
32546           SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
32547                                     DAG.getIntPtrConstant(0, dl));
32548           Results.push_back(Res);
32549           Results.push_back(Ld.getValue(1));
32550           return;
32551         }
32552         // We use an alternative sequence for SSE1 that extracts as v2f32 and
32553         // then casts to i64. This avoids a 128-bit stack temporary being
32554         // created by type legalization if we were to cast v4f32->v2i64.
32555         SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
32556                                   DAG.getIntPtrConstant(0, dl));
32557         Res = DAG.getBitcast(MVT::i64, Res);
32558         Results.push_back(Res);
32559         Results.push_back(Ld.getValue(1));
32560         return;
32561       }
32562       if (Subtarget.hasX87()) {
32563         // First load this into an 80-bit X87 register. This will put the whole
32564         // integer into the significand.
32565         SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
32566         SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
32567         SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
32568                                                  dl, Tys, Ops, MVT::i64,
32569                                                  Node->getMemOperand());
32570         SDValue Chain = Result.getValue(1);
32571 
32572         // Now store the X87 register to a stack temporary and convert to i64.
32573         // This store is not atomic and doesn't need to be.
32574         // FIXME: We don't need a stack temporary if the result of the load
32575         // is already being stored. We could just directly store there.
32576         SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
32577         int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
32578         MachinePointerInfo MPI =
32579             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
32580         SDValue StoreOps[] = { Chain, Result, StackPtr };
32581         Chain = DAG.getMemIntrinsicNode(
32582             X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
32583             MPI, None /*Align*/, MachineMemOperand::MOStore);
32584 
32585         // Finally load the value back from the stack temporary and return it.
32586         // This load is not atomic and doesn't need to be.
32587         // This load will be further type legalized.
32588         Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
32589         Results.push_back(Result);
32590         Results.push_back(Result.getValue(1));
32591         return;
32592       }
32593     }
32594     // TODO: Use MOVLPS when SSE1 is available?
32595     // Delegate to generic TypeLegalization. Situations we can really handle
32596     // should have already been dealt with by AtomicExpandPass.cpp.
32597     break;
32598   }
32599   case ISD::ATOMIC_SWAP:
32600   case ISD::ATOMIC_LOAD_ADD:
32601   case ISD::ATOMIC_LOAD_SUB:
32602   case ISD::ATOMIC_LOAD_AND:
32603   case ISD::ATOMIC_LOAD_OR:
32604   case ISD::ATOMIC_LOAD_XOR:
32605   case ISD::ATOMIC_LOAD_NAND:
32606   case ISD::ATOMIC_LOAD_MIN:
32607   case ISD::ATOMIC_LOAD_MAX:
32608   case ISD::ATOMIC_LOAD_UMIN:
32609   case ISD::ATOMIC_LOAD_UMAX:
32610     // Delegate to generic TypeLegalization. Situations we can really handle
32611     // should have already been dealt with by AtomicExpandPass.cpp.
32612     break;
32613 
32614   case ISD::BITCAST: {
32615     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32616     EVT DstVT = N->getValueType(0);
32617     EVT SrcVT = N->getOperand(0).getValueType();
32618 
    // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target
32620     // we can split using the k-register rather than memory.
32621     if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
32622       assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32623       SDValue Lo, Hi;
32624       std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
32625       Lo = DAG.getBitcast(MVT::i32, Lo);
32626       Hi = DAG.getBitcast(MVT::i32, Hi);
32627       SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
32628       Results.push_back(Res);
32629       return;
32630     }
32631 
32632     if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
32633       // FIXME: Use v4f32 for SSE1?
32634       assert(Subtarget.hasSSE2() && "Requires SSE2");
32635       assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
32636              "Unexpected type action!");
32637       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
32638       SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
32639                                 N->getOperand(0));
32640       Res = DAG.getBitcast(WideVT, Res);
32641       Results.push_back(Res);
32642       return;
32643     }
32644 
32645     return;
32646   }
32647   case ISD::MGATHER: {
32648     EVT VT = N->getValueType(0);
32649     if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
32650         (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
32651       auto *Gather = cast<MaskedGatherSDNode>(N);
32652       SDValue Index = Gather->getIndex();
32653       if (Index.getValueType() != MVT::v2i64)
32654         return;
32655       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32656              "Unexpected type action!");
32657       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
32658       SDValue Mask = Gather->getMask();
32659       assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
32660       SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
32661                                      Gather->getPassThru(),
32662                                      DAG.getUNDEF(VT));
32663       if (!Subtarget.hasVLX()) {
32664         // We need to widen the mask, but the instruction will only use 2
32665         // of its elements. So we can use undef.
32666         Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
32667                            DAG.getUNDEF(MVT::v2i1));
32668         Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
32669       }
32670       SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
32671                         Gather->getBasePtr(), Index, Gather->getScale() };
32672       SDValue Res = DAG.getMemIntrinsicNode(
32673           X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
32674           Gather->getMemoryVT(), Gather->getMemOperand());
32675       Results.push_back(Res);
32676       Results.push_back(Res.getValue(1));
32677       return;
32678     }
32679     return;
32680   }
32681   case ISD::LOAD: {
32682     // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
    // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an
    // int->fp cast since type legalization will try to use an i64 load.
32685     MVT VT = N->getSimpleValueType(0);
32686     assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
32687     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32688            "Unexpected type action!");
32689     if (!ISD::isNON_EXTLoad(N))
32690       return;
32691     auto *Ld = cast<LoadSDNode>(N);
32692     if (Subtarget.hasSSE2()) {
32693       MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
32694       SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
32695                                 Ld->getPointerInfo(), Ld->getOriginalAlign(),
32696                                 Ld->getMemOperand()->getFlags());
32697       SDValue Chain = Res.getValue(1);
32698       MVT VecVT = MVT::getVectorVT(LdVT, 2);
32699       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
32700       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
32701       Res = DAG.getBitcast(WideVT, Res);
32702       Results.push_back(Res);
32703       Results.push_back(Chain);
32704       return;
32705     }
32706     assert(Subtarget.hasSSE1() && "Expected SSE");
32707     SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
32708     SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
32709     SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
32710                                           MVT::i64, Ld->getMemOperand());
32711     Results.push_back(Res);
32712     Results.push_back(Res.getValue(1));
32713     return;
32714   }
32715   case ISD::ADDRSPACECAST: {
32716     SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
32717     Results.push_back(V);
32718     return;
32719   }
32720   case ISD::BITREVERSE:
32721     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32722     assert(Subtarget.hasXOP() && "Expected XOP");
32723     // We can use VPPERM by copying to a vector register and back. We'll need
32724     // to move the scalar in two i32 pieces.
32725     Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
32726     return;
32727   }
32728 }
32729 
32730 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
32731   switch ((X86ISD::NodeType)Opcode) {
32732   case X86ISD::FIRST_NUMBER:       break;
32733 #define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
32734   NODE_NAME_CASE(BSF)
32735   NODE_NAME_CASE(BSR)
32736   NODE_NAME_CASE(FSHL)
32737   NODE_NAME_CASE(FSHR)
32738   NODE_NAME_CASE(FAND)
32739   NODE_NAME_CASE(FANDN)
32740   NODE_NAME_CASE(FOR)
32741   NODE_NAME_CASE(FXOR)
32742   NODE_NAME_CASE(FILD)
32743   NODE_NAME_CASE(FIST)
32744   NODE_NAME_CASE(FP_TO_INT_IN_MEM)
32745   NODE_NAME_CASE(FLD)
32746   NODE_NAME_CASE(FST)
32747   NODE_NAME_CASE(CALL)
32748   NODE_NAME_CASE(CALL_RVMARKER)
32749   NODE_NAME_CASE(BT)
32750   NODE_NAME_CASE(CMP)
32751   NODE_NAME_CASE(FCMP)
32752   NODE_NAME_CASE(STRICT_FCMP)
32753   NODE_NAME_CASE(STRICT_FCMPS)
32754   NODE_NAME_CASE(COMI)
32755   NODE_NAME_CASE(UCOMI)
32756   NODE_NAME_CASE(CMPM)
32757   NODE_NAME_CASE(CMPMM)
32758   NODE_NAME_CASE(STRICT_CMPM)
32759   NODE_NAME_CASE(CMPMM_SAE)
32760   NODE_NAME_CASE(SETCC)
32761   NODE_NAME_CASE(SETCC_CARRY)
32762   NODE_NAME_CASE(FSETCC)
32763   NODE_NAME_CASE(FSETCCM)
32764   NODE_NAME_CASE(FSETCCM_SAE)
32765   NODE_NAME_CASE(CMOV)
32766   NODE_NAME_CASE(BRCOND)
32767   NODE_NAME_CASE(RET_FLAG)
32768   NODE_NAME_CASE(IRET)
32769   NODE_NAME_CASE(REP_STOS)
32770   NODE_NAME_CASE(REP_MOVS)
32771   NODE_NAME_CASE(GlobalBaseReg)
32772   NODE_NAME_CASE(Wrapper)
32773   NODE_NAME_CASE(WrapperRIP)
32774   NODE_NAME_CASE(MOVQ2DQ)
32775   NODE_NAME_CASE(MOVDQ2Q)
32776   NODE_NAME_CASE(MMX_MOVD2W)
32777   NODE_NAME_CASE(MMX_MOVW2D)
32778   NODE_NAME_CASE(PEXTRB)
32779   NODE_NAME_CASE(PEXTRW)
32780   NODE_NAME_CASE(INSERTPS)
32781   NODE_NAME_CASE(PINSRB)
32782   NODE_NAME_CASE(PINSRW)
32783   NODE_NAME_CASE(PSHUFB)
32784   NODE_NAME_CASE(ANDNP)
32785   NODE_NAME_CASE(BLENDI)
32786   NODE_NAME_CASE(BLENDV)
32787   NODE_NAME_CASE(HADD)
32788   NODE_NAME_CASE(HSUB)
32789   NODE_NAME_CASE(FHADD)
32790   NODE_NAME_CASE(FHSUB)
32791   NODE_NAME_CASE(CONFLICT)
32792   NODE_NAME_CASE(FMAX)
32793   NODE_NAME_CASE(FMAXS)
32794   NODE_NAME_CASE(FMAX_SAE)
32795   NODE_NAME_CASE(FMAXS_SAE)
32796   NODE_NAME_CASE(FMIN)
32797   NODE_NAME_CASE(FMINS)
32798   NODE_NAME_CASE(FMIN_SAE)
32799   NODE_NAME_CASE(FMINS_SAE)
32800   NODE_NAME_CASE(FMAXC)
32801   NODE_NAME_CASE(FMINC)
32802   NODE_NAME_CASE(FRSQRT)
32803   NODE_NAME_CASE(FRCP)
32804   NODE_NAME_CASE(EXTRQI)
32805   NODE_NAME_CASE(INSERTQI)
32806   NODE_NAME_CASE(TLSADDR)
32807   NODE_NAME_CASE(TLSBASEADDR)
32808   NODE_NAME_CASE(TLSCALL)
32809   NODE_NAME_CASE(EH_SJLJ_SETJMP)
32810   NODE_NAME_CASE(EH_SJLJ_LONGJMP)
32811   NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
32812   NODE_NAME_CASE(EH_RETURN)
32813   NODE_NAME_CASE(TC_RETURN)
32814   NODE_NAME_CASE(FNSTCW16m)
32815   NODE_NAME_CASE(FLDCW16m)
32816   NODE_NAME_CASE(LCMPXCHG_DAG)
32817   NODE_NAME_CASE(LCMPXCHG8_DAG)
32818   NODE_NAME_CASE(LCMPXCHG16_DAG)
32819   NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
32820   NODE_NAME_CASE(LADD)
32821   NODE_NAME_CASE(LSUB)
32822   NODE_NAME_CASE(LOR)
32823   NODE_NAME_CASE(LXOR)
32824   NODE_NAME_CASE(LAND)
32825   NODE_NAME_CASE(VZEXT_MOVL)
32826   NODE_NAME_CASE(VZEXT_LOAD)
32827   NODE_NAME_CASE(VEXTRACT_STORE)
32828   NODE_NAME_CASE(VTRUNC)
32829   NODE_NAME_CASE(VTRUNCS)
32830   NODE_NAME_CASE(VTRUNCUS)
32831   NODE_NAME_CASE(VMTRUNC)
32832   NODE_NAME_CASE(VMTRUNCS)
32833   NODE_NAME_CASE(VMTRUNCUS)
32834   NODE_NAME_CASE(VTRUNCSTORES)
32835   NODE_NAME_CASE(VTRUNCSTOREUS)
32836   NODE_NAME_CASE(VMTRUNCSTORES)
32837   NODE_NAME_CASE(VMTRUNCSTOREUS)
32838   NODE_NAME_CASE(VFPEXT)
32839   NODE_NAME_CASE(STRICT_VFPEXT)
32840   NODE_NAME_CASE(VFPEXT_SAE)
32841   NODE_NAME_CASE(VFPEXTS)
32842   NODE_NAME_CASE(VFPEXTS_SAE)
32843   NODE_NAME_CASE(VFPROUND)
32844   NODE_NAME_CASE(STRICT_VFPROUND)
32845   NODE_NAME_CASE(VMFPROUND)
32846   NODE_NAME_CASE(VFPROUND_RND)
32847   NODE_NAME_CASE(VFPROUNDS)
32848   NODE_NAME_CASE(VFPROUNDS_RND)
32849   NODE_NAME_CASE(VSHLDQ)
32850   NODE_NAME_CASE(VSRLDQ)
32851   NODE_NAME_CASE(VSHL)
32852   NODE_NAME_CASE(VSRL)
32853   NODE_NAME_CASE(VSRA)
32854   NODE_NAME_CASE(VSHLI)
32855   NODE_NAME_CASE(VSRLI)
32856   NODE_NAME_CASE(VSRAI)
32857   NODE_NAME_CASE(VSHLV)
32858   NODE_NAME_CASE(VSRLV)
32859   NODE_NAME_CASE(VSRAV)
32860   NODE_NAME_CASE(VROTLI)
32861   NODE_NAME_CASE(VROTRI)
32862   NODE_NAME_CASE(VPPERM)
32863   NODE_NAME_CASE(CMPP)
32864   NODE_NAME_CASE(STRICT_CMPP)
32865   NODE_NAME_CASE(PCMPEQ)
32866   NODE_NAME_CASE(PCMPGT)
32867   NODE_NAME_CASE(PHMINPOS)
32868   NODE_NAME_CASE(ADD)
32869   NODE_NAME_CASE(SUB)
32870   NODE_NAME_CASE(ADC)
32871   NODE_NAME_CASE(SBB)
32872   NODE_NAME_CASE(SMUL)
32873   NODE_NAME_CASE(UMUL)
32874   NODE_NAME_CASE(OR)
32875   NODE_NAME_CASE(XOR)
32876   NODE_NAME_CASE(AND)
32877   NODE_NAME_CASE(BEXTR)
32878   NODE_NAME_CASE(BEXTRI)
32879   NODE_NAME_CASE(BZHI)
32880   NODE_NAME_CASE(PDEP)
32881   NODE_NAME_CASE(PEXT)
32882   NODE_NAME_CASE(MUL_IMM)
32883   NODE_NAME_CASE(MOVMSK)
32884   NODE_NAME_CASE(PTEST)
32885   NODE_NAME_CASE(TESTP)
32886   NODE_NAME_CASE(KORTEST)
32887   NODE_NAME_CASE(KTEST)
32888   NODE_NAME_CASE(KADD)
32889   NODE_NAME_CASE(KSHIFTL)
32890   NODE_NAME_CASE(KSHIFTR)
32891   NODE_NAME_CASE(PACKSS)
32892   NODE_NAME_CASE(PACKUS)
32893   NODE_NAME_CASE(PALIGNR)
32894   NODE_NAME_CASE(VALIGN)
32895   NODE_NAME_CASE(VSHLD)
32896   NODE_NAME_CASE(VSHRD)
32897   NODE_NAME_CASE(VSHLDV)
32898   NODE_NAME_CASE(VSHRDV)
32899   NODE_NAME_CASE(PSHUFD)
32900   NODE_NAME_CASE(PSHUFHW)
32901   NODE_NAME_CASE(PSHUFLW)
32902   NODE_NAME_CASE(SHUFP)
32903   NODE_NAME_CASE(SHUF128)
32904   NODE_NAME_CASE(MOVLHPS)
32905   NODE_NAME_CASE(MOVHLPS)
32906   NODE_NAME_CASE(MOVDDUP)
32907   NODE_NAME_CASE(MOVSHDUP)
32908   NODE_NAME_CASE(MOVSLDUP)
32909   NODE_NAME_CASE(MOVSD)
32910   NODE_NAME_CASE(MOVSS)
32911   NODE_NAME_CASE(MOVSH)
32912   NODE_NAME_CASE(UNPCKL)
32913   NODE_NAME_CASE(UNPCKH)
32914   NODE_NAME_CASE(VBROADCAST)
32915   NODE_NAME_CASE(VBROADCAST_LOAD)
32916   NODE_NAME_CASE(VBROADCASTM)
32917   NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
32918   NODE_NAME_CASE(VPERMILPV)
32919   NODE_NAME_CASE(VPERMILPI)
32920   NODE_NAME_CASE(VPERM2X128)
32921   NODE_NAME_CASE(VPERMV)
32922   NODE_NAME_CASE(VPERMV3)
32923   NODE_NAME_CASE(VPERMI)
32924   NODE_NAME_CASE(VPTERNLOG)
32925   NODE_NAME_CASE(VFIXUPIMM)
32926   NODE_NAME_CASE(VFIXUPIMM_SAE)
32927   NODE_NAME_CASE(VFIXUPIMMS)
32928   NODE_NAME_CASE(VFIXUPIMMS_SAE)
32929   NODE_NAME_CASE(VRANGE)
32930   NODE_NAME_CASE(VRANGE_SAE)
32931   NODE_NAME_CASE(VRANGES)
32932   NODE_NAME_CASE(VRANGES_SAE)
32933   NODE_NAME_CASE(PMULUDQ)
32934   NODE_NAME_CASE(PMULDQ)
32935   NODE_NAME_CASE(PSADBW)
32936   NODE_NAME_CASE(DBPSADBW)
32937   NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
32938   NODE_NAME_CASE(VAARG_64)
32939   NODE_NAME_CASE(VAARG_X32)
32940   NODE_NAME_CASE(DYN_ALLOCA)
32941   NODE_NAME_CASE(MEMBARRIER)
32942   NODE_NAME_CASE(MFENCE)
32943   NODE_NAME_CASE(SEG_ALLOCA)
32944   NODE_NAME_CASE(PROBED_ALLOCA)
32945   NODE_NAME_CASE(RDRAND)
32946   NODE_NAME_CASE(RDSEED)
32947   NODE_NAME_CASE(RDPKRU)
32948   NODE_NAME_CASE(WRPKRU)
32949   NODE_NAME_CASE(VPMADDUBSW)
32950   NODE_NAME_CASE(VPMADDWD)
32951   NODE_NAME_CASE(VPSHA)
32952   NODE_NAME_CASE(VPSHL)
32953   NODE_NAME_CASE(VPCOM)
32954   NODE_NAME_CASE(VPCOMU)
32955   NODE_NAME_CASE(VPERMIL2)
32956   NODE_NAME_CASE(FMSUB)
32957   NODE_NAME_CASE(STRICT_FMSUB)
32958   NODE_NAME_CASE(FNMADD)
32959   NODE_NAME_CASE(STRICT_FNMADD)
32960   NODE_NAME_CASE(FNMSUB)
32961   NODE_NAME_CASE(STRICT_FNMSUB)
32962   NODE_NAME_CASE(FMADDSUB)
32963   NODE_NAME_CASE(FMSUBADD)
32964   NODE_NAME_CASE(FMADD_RND)
32965   NODE_NAME_CASE(FNMADD_RND)
32966   NODE_NAME_CASE(FMSUB_RND)
32967   NODE_NAME_CASE(FNMSUB_RND)
32968   NODE_NAME_CASE(FMADDSUB_RND)
32969   NODE_NAME_CASE(FMSUBADD_RND)
32970   NODE_NAME_CASE(VFMADDC)
32971   NODE_NAME_CASE(VFMADDC_RND)
32972   NODE_NAME_CASE(VFCMADDC)
32973   NODE_NAME_CASE(VFCMADDC_RND)
32974   NODE_NAME_CASE(VFMULC)
32975   NODE_NAME_CASE(VFMULC_RND)
32976   NODE_NAME_CASE(VFCMULC)
32977   NODE_NAME_CASE(VFCMULC_RND)
32978   NODE_NAME_CASE(VFMULCSH)
32979   NODE_NAME_CASE(VFMULCSH_RND)
32980   NODE_NAME_CASE(VFCMULCSH)
32981   NODE_NAME_CASE(VFCMULCSH_RND)
32982   NODE_NAME_CASE(VFMADDCSH)
32983   NODE_NAME_CASE(VFMADDCSH_RND)
32984   NODE_NAME_CASE(VFCMADDCSH)
32985   NODE_NAME_CASE(VFCMADDCSH_RND)
32986   NODE_NAME_CASE(VPMADD52H)
32987   NODE_NAME_CASE(VPMADD52L)
32988   NODE_NAME_CASE(VRNDSCALE)
32989   NODE_NAME_CASE(STRICT_VRNDSCALE)
32990   NODE_NAME_CASE(VRNDSCALE_SAE)
32991   NODE_NAME_CASE(VRNDSCALES)
32992   NODE_NAME_CASE(VRNDSCALES_SAE)
32993   NODE_NAME_CASE(VREDUCE)
32994   NODE_NAME_CASE(VREDUCE_SAE)
32995   NODE_NAME_CASE(VREDUCES)
32996   NODE_NAME_CASE(VREDUCES_SAE)
32997   NODE_NAME_CASE(VGETMANT)
32998   NODE_NAME_CASE(VGETMANT_SAE)
32999   NODE_NAME_CASE(VGETMANTS)
33000   NODE_NAME_CASE(VGETMANTS_SAE)
33001   NODE_NAME_CASE(PCMPESTR)
33002   NODE_NAME_CASE(PCMPISTR)
33003   NODE_NAME_CASE(XTEST)
33004   NODE_NAME_CASE(COMPRESS)
33005   NODE_NAME_CASE(EXPAND)
33006   NODE_NAME_CASE(SELECTS)
33007   NODE_NAME_CASE(ADDSUB)
33008   NODE_NAME_CASE(RCP14)
33009   NODE_NAME_CASE(RCP14S)
33010   NODE_NAME_CASE(RCP28)
33011   NODE_NAME_CASE(RCP28_SAE)
33012   NODE_NAME_CASE(RCP28S)
33013   NODE_NAME_CASE(RCP28S_SAE)
33014   NODE_NAME_CASE(EXP2)
33015   NODE_NAME_CASE(EXP2_SAE)
33016   NODE_NAME_CASE(RSQRT14)
33017   NODE_NAME_CASE(RSQRT14S)
33018   NODE_NAME_CASE(RSQRT28)
33019   NODE_NAME_CASE(RSQRT28_SAE)
33020   NODE_NAME_CASE(RSQRT28S)
33021   NODE_NAME_CASE(RSQRT28S_SAE)
33022   NODE_NAME_CASE(FADD_RND)
33023   NODE_NAME_CASE(FADDS)
33024   NODE_NAME_CASE(FADDS_RND)
33025   NODE_NAME_CASE(FSUB_RND)
33026   NODE_NAME_CASE(FSUBS)
33027   NODE_NAME_CASE(FSUBS_RND)
33028   NODE_NAME_CASE(FMUL_RND)
33029   NODE_NAME_CASE(FMULS)
33030   NODE_NAME_CASE(FMULS_RND)
33031   NODE_NAME_CASE(FDIV_RND)
33032   NODE_NAME_CASE(FDIVS)
33033   NODE_NAME_CASE(FDIVS_RND)
33034   NODE_NAME_CASE(FSQRT_RND)
33035   NODE_NAME_CASE(FSQRTS)
33036   NODE_NAME_CASE(FSQRTS_RND)
33037   NODE_NAME_CASE(FGETEXP)
33038   NODE_NAME_CASE(FGETEXP_SAE)
33039   NODE_NAME_CASE(FGETEXPS)
33040   NODE_NAME_CASE(FGETEXPS_SAE)
33041   NODE_NAME_CASE(SCALEF)
33042   NODE_NAME_CASE(SCALEF_RND)
33043   NODE_NAME_CASE(SCALEFS)
33044   NODE_NAME_CASE(SCALEFS_RND)
33045   NODE_NAME_CASE(AVG)
33046   NODE_NAME_CASE(MULHRS)
33047   NODE_NAME_CASE(SINT_TO_FP_RND)
33048   NODE_NAME_CASE(UINT_TO_FP_RND)
33049   NODE_NAME_CASE(CVTTP2SI)
33050   NODE_NAME_CASE(CVTTP2UI)
33051   NODE_NAME_CASE(STRICT_CVTTP2SI)
33052   NODE_NAME_CASE(STRICT_CVTTP2UI)
33053   NODE_NAME_CASE(MCVTTP2SI)
33054   NODE_NAME_CASE(MCVTTP2UI)
33055   NODE_NAME_CASE(CVTTP2SI_SAE)
33056   NODE_NAME_CASE(CVTTP2UI_SAE)
33057   NODE_NAME_CASE(CVTTS2SI)
33058   NODE_NAME_CASE(CVTTS2UI)
33059   NODE_NAME_CASE(CVTTS2SI_SAE)
33060   NODE_NAME_CASE(CVTTS2UI_SAE)
33061   NODE_NAME_CASE(CVTSI2P)
33062   NODE_NAME_CASE(CVTUI2P)
33063   NODE_NAME_CASE(STRICT_CVTSI2P)
33064   NODE_NAME_CASE(STRICT_CVTUI2P)
33065   NODE_NAME_CASE(MCVTSI2P)
33066   NODE_NAME_CASE(MCVTUI2P)
33067   NODE_NAME_CASE(VFPCLASS)
33068   NODE_NAME_CASE(VFPCLASSS)
33069   NODE_NAME_CASE(MULTISHIFT)
33070   NODE_NAME_CASE(SCALAR_SINT_TO_FP)
33071   NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
33072   NODE_NAME_CASE(SCALAR_UINT_TO_FP)
33073   NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
33074   NODE_NAME_CASE(CVTPS2PH)
33075   NODE_NAME_CASE(STRICT_CVTPS2PH)
33076   NODE_NAME_CASE(MCVTPS2PH)
33077   NODE_NAME_CASE(CVTPH2PS)
33078   NODE_NAME_CASE(STRICT_CVTPH2PS)
33079   NODE_NAME_CASE(CVTPH2PS_SAE)
33080   NODE_NAME_CASE(CVTP2SI)
33081   NODE_NAME_CASE(CVTP2UI)
33082   NODE_NAME_CASE(MCVTP2SI)
33083   NODE_NAME_CASE(MCVTP2UI)
33084   NODE_NAME_CASE(CVTP2SI_RND)
33085   NODE_NAME_CASE(CVTP2UI_RND)
33086   NODE_NAME_CASE(CVTS2SI)
33087   NODE_NAME_CASE(CVTS2UI)
33088   NODE_NAME_CASE(CVTS2SI_RND)
33089   NODE_NAME_CASE(CVTS2UI_RND)
33090   NODE_NAME_CASE(CVTNE2PS2BF16)
33091   NODE_NAME_CASE(CVTNEPS2BF16)
33092   NODE_NAME_CASE(MCVTNEPS2BF16)
33093   NODE_NAME_CASE(DPBF16PS)
33094   NODE_NAME_CASE(LWPINS)
33095   NODE_NAME_CASE(MGATHER)
33096   NODE_NAME_CASE(MSCATTER)
33097   NODE_NAME_CASE(VPDPBUSD)
33098   NODE_NAME_CASE(VPDPBUSDS)
33099   NODE_NAME_CASE(VPDPWSSD)
33100   NODE_NAME_CASE(VPDPWSSDS)
33101   NODE_NAME_CASE(VPSHUFBITQMB)
33102   NODE_NAME_CASE(GF2P8MULB)
33103   NODE_NAME_CASE(GF2P8AFFINEQB)
33104   NODE_NAME_CASE(GF2P8AFFINEINVQB)
33105   NODE_NAME_CASE(NT_CALL)
33106   NODE_NAME_CASE(NT_BRIND)
33107   NODE_NAME_CASE(UMWAIT)
33108   NODE_NAME_CASE(TPAUSE)
33109   NODE_NAME_CASE(ENQCMD)
33110   NODE_NAME_CASE(ENQCMDS)
33111   NODE_NAME_CASE(VP2INTERSECT)
33112   NODE_NAME_CASE(AESENC128KL)
33113   NODE_NAME_CASE(AESDEC128KL)
33114   NODE_NAME_CASE(AESENC256KL)
33115   NODE_NAME_CASE(AESDEC256KL)
33116   NODE_NAME_CASE(AESENCWIDE128KL)
33117   NODE_NAME_CASE(AESDECWIDE128KL)
33118   NODE_NAME_CASE(AESENCWIDE256KL)
33119   NODE_NAME_CASE(AESDECWIDE256KL)
33120   NODE_NAME_CASE(TESTUI)
33121   }
33122   return nullptr;
33123 #undef NODE_NAME_CASE
33124 }
33125 
33126 /// Return true if the addressing mode represented by AM is legal for this
33127 /// target, for a load/store of the specified type.
33128 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
33129                                               const AddrMode &AM, Type *Ty,
33130                                               unsigned AS,
33131                                               Instruction *I) const {
33132   // X86 supports extremely general addressing modes.
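        // Roughly, the form being validated is BaseGV + BaseReg + Scale*IndexReg +
        // BaseOffs, e.g. (an illustrative sketch in AT&T syntax, not from a test):
        //   movl 16(%rdi,%rcx,4), %eax   # base=%rdi, index=%rcx, scale=4, disp=16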
33133   CodeModel::Model M = getTargetMachine().getCodeModel();
33134 
33135   // X86 allows a sign-extended 32-bit immediate field as a displacement.
33136   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
33137     return false;
33138 
33139   if (AM.BaseGV) {
33140     unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
33141 
33142     // If a reference to this global requires an extra load, we can't fold it.
33143     if (isGlobalStubReference(GVFlags))
33144       return false;
33145 
33146     // If BaseGV requires a register for the PIC base, we cannot also have a
33147     // BaseReg specified.
33148     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
33149       return false;
33150 
33151     // If lower 4G is not available, then we must use rip-relative addressing.
33152     if ((M != CodeModel::Small || isPositionIndependent()) &&
33153         Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
33154       return false;
33155   }
33156 
33157   switch (AM.Scale) {
33158   case 0:
33159   case 1:
33160   case 2:
33161   case 4:
33162   case 8:
33163     // These scales always work.
33164     break;
33165   case 3:
33166   case 5:
33167   case 9:
33168     // These scales are formed with basereg+scalereg.  Only accept if there is
33169     // no basereg yet.
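          // For example (a sketch in AT&T syntax), scale 3 can be matched as
          //   lea (%rax,%rax,2), %rcx
          // where the index register doubles as the base, so a separate base
          // register cannot also be folded in.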
33170     if (AM.HasBaseReg)
33171       return false;
33172     break;
33173   default:  // Other stuff never works.
33174     return false;
33175   }
33176 
33177   return true;
33178 }
33179 
33180 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
33181   unsigned Bits = Ty->getScalarSizeInBits();
33182 
33183   // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
33184   // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
33185   if (Subtarget.hasXOP() &&
33186       (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
33187     return false;
33188 
33189   // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
33190   // shifts just as cheap as scalar ones.
33191   if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
33192     return false;
33193 
33194   // AVX512BW has shifts such as vpsllvw.
33195   if (Subtarget.hasBWI() && Bits == 16)
33196     return false;
33197 
33198   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
33199   // fully general vector.
33200   return true;
33201 }
33202 
33203 bool X86TargetLowering::isBinOp(unsigned Opcode) const {
33204   switch (Opcode) {
33205   // These are non-commutative binops.
33206   // TODO: Add more X86ISD opcodes once we have test coverage.
33207   case X86ISD::ANDNP:
33208   case X86ISD::PCMPGT:
33209   case X86ISD::FMAX:
33210   case X86ISD::FMIN:
33211   case X86ISD::FANDN:
33212   case X86ISD::VPSHA:
33213   case X86ISD::VPSHL:
33214   case X86ISD::VSHLV:
33215   case X86ISD::VSRLV:
33216   case X86ISD::VSRAV:
33217     return true;
33218   }
33219 
33220   return TargetLoweringBase::isBinOp(Opcode);
33221 }
33222 
33223 bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
33224   switch (Opcode) {
33225   // TODO: Add more X86ISD opcodes once we have test coverage.
33226   case X86ISD::AVG:
33227   case X86ISD::PCMPEQ:
33228   case X86ISD::PMULDQ:
33229   case X86ISD::PMULUDQ:
33230   case X86ISD::FMAXC:
33231   case X86ISD::FMINC:
33232   case X86ISD::FAND:
33233   case X86ISD::FOR:
33234   case X86ISD::FXOR:
33235     return true;
33236   }
33237 
33238   return TargetLoweringBase::isCommutativeBinOp(Opcode);
33239 }
33240 
33241 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
33242   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
33243     return false;
33244   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
33245   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
33246   return NumBits1 > NumBits2;
33247 }
33248 
33249 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
33250   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
33251     return false;
33252 
33253   if (!isTypeLegal(EVT::getEVT(Ty1)))
33254     return false;
33255 
33256   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
33257 
33258   // Assuming the caller doesn't have a zeroext or signext return parameter,
33259   // truncation all the way down to i1 is valid.
33260   return true;
33261 }
33262 
33263 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
33264   return isInt<32>(Imm);
33265 }
33266 
33267 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
33268   // Can also use sub to handle negated immediates.
33269   return isInt<32>(Imm);
33270 }
33271 
33272 bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
33273   return isInt<32>(Imm);
33274 }
33275 
33276 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
33277   if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
33278     return false;
33279   unsigned NumBits1 = VT1.getSizeInBits();
33280   unsigned NumBits2 = VT2.getSizeInBits();
33281   return NumBits1 > NumBits2;
33282 }
33283 
33284 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
33285   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
33286   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
33287 }
33288 
33289 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
33290   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
33291   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
33292 }
33293 
33294 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
33295   EVT VT1 = Val.getValueType();
33296   if (isZExtFree(VT1, VT2))
33297     return true;
33298 
33299   if (Val.getOpcode() != ISD::LOAD)
33300     return false;
33301 
33302   if (!VT1.isSimple() || !VT1.isInteger() ||
33303       !VT2.isSimple() || !VT2.isInteger())
33304     return false;
33305 
33306   switch (VT1.getSimpleVT().SimpleTy) {
33307   default: break;
33308   case MVT::i8:
33309   case MVT::i16:
33310   case MVT::i32:
33311     // X86 has 8, 16, and 32-bit zero-extending loads.
33312     return true;
33313   }
33314 
33315   return false;
33316 }
33317 
33318 bool X86TargetLowering::shouldSinkOperands(Instruction *I,
33319                                            SmallVectorImpl<Use *> &Ops) const {
33320   using namespace llvm::PatternMatch;
33321 
33322   FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
33323   if (!VTy)
33324     return false;
33325 
33326   if (I->getOpcode() == Instruction::Mul &&
33327       VTy->getElementType()->isIntegerTy(64)) {
33328     for (auto &Op : I->operands()) {
33329       // Make sure we are not already sinking this operand
33330       if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
33331         continue;
33332 
33333       // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
33334       // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
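            // Illustrative operand shapes (hypothetical IR, shown only as a sketch):
            //   ashr (shl %x, 32), 32    ; sext_inreg of the low 32 bits -> PMULDQ
            //   and %x, 4294967295       ; zext_inreg of the low 32 bits -> PMULUDQ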
33335       if (Subtarget.hasSSE41() &&
33336           match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
33337                                  m_SpecificInt(32)))) {
33338         Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
33339         Ops.push_back(&Op);
33340       } else if (Subtarget.hasSSE2() &&
33341                  match(Op.get(),
33342                        m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
33343         Ops.push_back(&Op);
33344       }
33345     }
33346 
33347     return !Ops.empty();
33348   }
33349 
33350   // A uniform shift amount in a vector shift or funnel shift may be much
33351   // cheaper than a generic variable vector shift, so make that pattern visible
33352   // to SDAG by sinking the shuffle instruction next to the shift.
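        // Sketch of the shape we want visible in one block (hypothetical IR):
        //   %amt = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> zeroinitializer
        //   %r   = shl <4 x i32> %x, %amt   ; can lower to a shift by a scalar amount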
33353   int ShiftAmountOpNum = -1;
33354   if (I->isShift())
33355     ShiftAmountOpNum = 1;
33356   else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
33357     if (II->getIntrinsicID() == Intrinsic::fshl ||
33358         II->getIntrinsicID() == Intrinsic::fshr)
33359       ShiftAmountOpNum = 2;
33360   }
33361 
33362   if (ShiftAmountOpNum == -1)
33363     return false;
33364 
33365   auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
33366   if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
33367       isVectorShiftByScalarCheap(I->getType())) {
33368     Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
33369     return true;
33370   }
33371 
33372   return false;
33373 }
33374 
33375 bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
33376   if (!Subtarget.is64Bit())
33377     return false;
33378   return TargetLowering::shouldConvertPhiType(From, To);
33379 }
33380 
33381 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
33382   if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
33383     return false;
33384 
33385   EVT SrcVT = ExtVal.getOperand(0).getValueType();
33386 
33387   // There is no extending load for vXi1.
33388   if (SrcVT.getScalarType() == MVT::i1)
33389     return false;
33390 
33391   return true;
33392 }
33393 
33394 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
33395                                                    EVT VT) const {
33396   if (!Subtarget.hasAnyFMA())
33397     return false;
33398 
33399   VT = VT.getScalarType();
33400 
33401   if (!VT.isSimple())
33402     return false;
33403 
33404   switch (VT.getSimpleVT().SimpleTy) {
33405   case MVT::f16:
33406     return Subtarget.hasFP16();
33407   case MVT::f32:
33408   case MVT::f64:
33409     return true;
33410   default:
33411     break;
33412   }
33413 
33414   return false;
33415 }
33416 
33417 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
33418   // i16 instructions are longer (0x66 prefix) and potentially slower.
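        // For example, 'addw $1, %ax' needs a 0x66 operand-size prefix that
        // 'addl $1, %eax' does not (an illustrative encoding note, not a test case).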
33419   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
33420 }
33421 
33422 bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
33423                                                              EVT VT) const {
33424   // TODO: This is too general. There are cases where pre-AVX512 codegen would
33425   //       benefit. The transform may also be profitable for scalar code.
33426   if (!Subtarget.hasAVX512())
33427     return false;
33428   if (!Subtarget.hasVLX() && !VT.is512BitVector())
33429     return false;
33430   if (!VT.isVector())
33431     return false;
33432 
33433   return true;
33434 }
33435 
33436 /// Targets can use this to indicate that they only support *some*
33437 /// VECTOR_SHUFFLE operations, those with specific masks.
33438 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
33439 /// are assumed to be legal.
33440 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
33441   if (!VT.isSimple())
33442     return false;
33443 
33444   // Not for i1 vectors
33445   if (VT.getSimpleVT().getScalarType() == MVT::i1)
33446     return false;
33447 
33448   // Very little shuffling can be done for 64-bit vectors right now.
33449   if (VT.getSimpleVT().getSizeInBits() == 64)
33450     return false;
33451 
33452   // We only care that the types being shuffled are legal. The lowering can
33453   // handle any possible shuffle mask that results.
33454   return isTypeLegal(VT.getSimpleVT());
33455 }
33456 
33457 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
33458                                                EVT VT) const {
33459   // Don't convert an 'and' into a shuffle that we don't directly support.
33460   // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
33461   if (!Subtarget.hasAVX2())
33462     if (VT == MVT::v32i8 || VT == MVT::v16i16)
33463       return false;
33464 
33465   // Just delegate to the generic legality, clear masks aren't special.
33466   return isShuffleMaskLegal(Mask, VT);
33467 }
33468 
33469 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
33470   // If the subtarget is using thunks, we must not generate jump tables.
33471   if (Subtarget.useIndirectThunkBranches())
33472     return false;
33473 
33474   // Otherwise, fall back on the generic logic.
33475   return TargetLowering::areJTsAllowed(Fn);
33476 }
33477 
33478 //===----------------------------------------------------------------------===//
33479 //                           X86 Scheduler Hooks
33480 //===----------------------------------------------------------------------===//
33481 
33482 // Returns true if EFLAGS is consumed after this iterator in the rest of the
33483 // basic block or any successors of the basic block.
33484 static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
33485                               MachineBasicBlock *BB) {
33486   // Scan forward through BB for a use/def of EFLAGS.
33487   for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
33488     if (mi.readsRegister(X86::EFLAGS))
33489       return true;
33490     // If we found a def, we can stop searching.
33491     if (mi.definesRegister(X86::EFLAGS))
33492       return false;
33493   }
33494 
33495   // If we hit the end of the block, check whether EFLAGS is live into a
33496   // successor.
33497   for (MachineBasicBlock *Succ : BB->successors())
33498     if (Succ->isLiveIn(X86::EFLAGS))
33499       return true;
33500 
33501   return false;
33502 }
33503 
33504 /// Utility function to emit xbegin specifying the start of an RTM region.
33505 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
33506                                      const TargetInstrInfo *TII) {
33507   const DebugLoc &DL = MI.getDebugLoc();
33508 
33509   const BasicBlock *BB = MBB->getBasicBlock();
33510   MachineFunction::iterator I = ++MBB->getIterator();
33511 
33512   // For the v = xbegin(), we generate
33513   //
33514   // thisMBB:
33515   //  xbegin fallMBB
33516   //
33517   // mainMBB:
33518   //  s0 = -1
33519   //
33520   // fallMBB:
33521   //  eax = # XABORT_DEF
33522   //  s1 = eax
33523   //
33524   // sinkMBB:
33525   //  v = phi(s0/mainMBB, s1/fallMBB)
33526 
33527   MachineBasicBlock *thisMBB = MBB;
33528   MachineFunction *MF = MBB->getParent();
33529   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
33530   MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
33531   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33532   MF->insert(I, mainMBB);
33533   MF->insert(I, fallMBB);
33534   MF->insert(I, sinkMBB);
33535 
33536   if (isEFLAGSLiveAfter(MI, MBB)) {
33537     mainMBB->addLiveIn(X86::EFLAGS);
33538     fallMBB->addLiveIn(X86::EFLAGS);
33539     sinkMBB->addLiveIn(X86::EFLAGS);
33540   }
33541 
33542   // Transfer the remainder of MBB and its successor edges to sinkMBB.
33543   sinkMBB->splice(sinkMBB->begin(), MBB,
33544                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33545   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33546 
33547   MachineRegisterInfo &MRI = MF->getRegInfo();
33548   Register DstReg = MI.getOperand(0).getReg();
33549   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
33550   Register mainDstReg = MRI.createVirtualRegister(RC);
33551   Register fallDstReg = MRI.createVirtualRegister(RC);
33552 
33553   // thisMBB:
33554   //  xbegin fallMBB
33555   //  # fallthrough to mainMBB
33556   //  # on abort, control transfers to fallMBB
33557   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
33558   thisMBB->addSuccessor(mainMBB);
33559   thisMBB->addSuccessor(fallMBB);
33560 
33561   // mainMBB:
33562   //  mainDstReg := -1
33563   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
33564   BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
33565   mainMBB->addSuccessor(sinkMBB);
33566 
33567   // fallMBB:
33568   //  ; pseudo instruction to model hardware's definition from XABORT
33569   //  EAX := XABORT_DEF
33570   //  fallDstReg := EAX
33571   BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
33572   BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
33573       .addReg(X86::EAX);
33574   fallMBB->addSuccessor(sinkMBB);
33575 
33576   // sinkMBB:
33577   //  DstReg := phi(mainDstReg/mainMBB, fallDstReg/fallMBB)
33578   BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
33579       .addReg(mainDstReg).addMBB(mainMBB)
33580       .addReg(fallDstReg).addMBB(fallMBB);
33581 
33582   MI.eraseFromParent();
33583   return sinkMBB;
33584 }
33585 
33586 MachineBasicBlock *
33587 X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
33588                                                MachineBasicBlock *MBB) const {
33589   // Emit va_arg instruction on X86-64.
33590 
33591   // Operands to this pseudo-instruction:
33592   // 0  ) Output        : destination address (reg)
33593   // 1-5) Input         : va_list address (addr, i64mem)
33594   // 6  ) ArgSize       : Size (in bytes) of vararg type
33595   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
33596   // 8  ) Align         : Alignment of type
33597   // 9  ) EFLAGS (implicit-def)
33598 
33599   assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
33600   static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
33601 
33602   Register DestReg = MI.getOperand(0).getReg();
33603   MachineOperand &Base = MI.getOperand(1);
33604   MachineOperand &Scale = MI.getOperand(2);
33605   MachineOperand &Index = MI.getOperand(3);
33606   MachineOperand &Disp = MI.getOperand(4);
33607   MachineOperand &Segment = MI.getOperand(5);
33608   unsigned ArgSize = MI.getOperand(6).getImm();
33609   unsigned ArgMode = MI.getOperand(7).getImm();
33610   Align Alignment = Align(MI.getOperand(8).getImm());
33611 
33612   MachineFunction *MF = MBB->getParent();
33613 
33614   // Memory Reference
33615   assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
33616 
33617   MachineMemOperand *OldMMO = MI.memoperands().front();
33618 
33619   // Clone the MMO into two separate MMOs for loading and storing
33620   MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
33621       OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
33622   MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
33623       OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
33624 
33625   // Machine Information
33626   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33627   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
33628   const TargetRegisterClass *AddrRegClass =
33629       getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
33630   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
33631   const DebugLoc &DL = MI.getDebugLoc();
33632 
33633   // struct va_list {
33634   //   i32   gp_offset
33635   //   i32   fp_offset
33636   //   i64   overflow_area (address)
33637   //   i64   reg_save_area (address)
33638   // }
33639   // sizeof(va_list) = 24
33640   // alignment(va_list) = 8
33641 
33642   unsigned TotalNumIntRegs = 6;
33643   unsigned TotalNumXMMRegs = 8;
33644   bool UseGPOffset = (ArgMode == 1);
33645   bool UseFPOffset = (ArgMode == 2);
33646   unsigned MaxOffset = TotalNumIntRegs * 8 +
33647                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
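        // i.e. 48 when only gp_offset is checked, and 48 + 128 = 176 when fp_offset
        // is used (the SysV x86-64 reg_save_area stores 6 GPRs then 8 XMM registers).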
33648 
33649   // Align ArgSize to a multiple of 8.
33650   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
33651   bool NeedsAlign = (Alignment > 8);
33652 
33653   MachineBasicBlock *thisMBB = MBB;
33654   MachineBasicBlock *overflowMBB;
33655   MachineBasicBlock *offsetMBB;
33656   MachineBasicBlock *endMBB;
33657 
33658   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
33659   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
33660   unsigned OffsetReg = 0;
33661 
33662   if (!UseGPOffset && !UseFPOffset) {
33663     // If we only pull from the overflow region, we don't need to create a
33664     // branch or otherwise alter control flow.
33665     OffsetDestReg = 0; // unused
33666     OverflowDestReg = DestReg;
33667 
33668     offsetMBB = nullptr;
33669     overflowMBB = thisMBB;
33670     endMBB = thisMBB;
33671   } else {
33672     // First emit code to check if gp_offset (or fp_offset) is below the bound.
33673     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
33674     // If not, pull from overflow_area. (branch to overflowMBB)
33675     //
33676     //       thisMBB
33677     //         |     .
33678     //         |        .
33679     //     offsetMBB   overflowMBB
33680     //         |        .
33681     //         |     .
33682     //        endMBB
33683 
33684     // Registers for the PHI in endMBB
33685     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
33686     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
33687 
33688     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
33689     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33690     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33691     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33692 
33693     MachineFunction::iterator MBBIter = ++MBB->getIterator();
33694 
33695     // Insert the new basic blocks
33696     MF->insert(MBBIter, offsetMBB);
33697     MF->insert(MBBIter, overflowMBB);
33698     MF->insert(MBBIter, endMBB);
33699 
33700     // Transfer the remainder of MBB and its successor edges to endMBB.
33701     endMBB->splice(endMBB->begin(), thisMBB,
33702                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
33703     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
33704 
33705     // Make offsetMBB and overflowMBB successors of thisMBB
33706     thisMBB->addSuccessor(offsetMBB);
33707     thisMBB->addSuccessor(overflowMBB);
33708 
33709     // endMBB is a successor of both offsetMBB and overflowMBB
33710     offsetMBB->addSuccessor(endMBB);
33711     overflowMBB->addSuccessor(endMBB);
33712 
33713     // Load the offset value into a register
33714     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
33715     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
33716         .add(Base)
33717         .add(Scale)
33718         .add(Index)
33719         .addDisp(Disp, UseFPOffset ? 4 : 0)
33720         .add(Segment)
33721         .setMemRefs(LoadOnlyMMO);
33722 
33723     // Check if there is enough room left to pull this argument.
33724     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
33725       .addReg(OffsetReg)
33726       .addImm(MaxOffset + 8 - ArgSizeA8);
33727 
33728     // Branch to "overflowMBB" if offset >= max
33729     // Fall through to "offsetMBB" otherwise
33730     BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
33731       .addMBB(overflowMBB).addImm(X86::COND_AE);
33732   }
33733 
33734   // In offsetMBB, emit code to use the reg_save_area.
33735   if (offsetMBB) {
33736     assert(OffsetReg != 0);
33737 
33738     // Read the reg_save_area address.
33739     Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
33740     BuildMI(
33741         offsetMBB, DL,
33742         TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
33743         RegSaveReg)
33744         .add(Base)
33745         .add(Scale)
33746         .add(Index)
33747         .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
33748         .add(Segment)
33749         .setMemRefs(LoadOnlyMMO);
33750 
33751     if (Subtarget.isTarget64BitLP64()) {
33752       // Zero-extend the offset
33753       Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
33754       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
33755           .addImm(0)
33756           .addReg(OffsetReg)
33757           .addImm(X86::sub_32bit);
33758 
33759       // Add the offset to the reg_save_area to get the final address.
33760       BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
33761           .addReg(OffsetReg64)
33762           .addReg(RegSaveReg);
33763     } else {
33764       // Add the offset to the reg_save_area to get the final address.
33765       BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
33766           .addReg(OffsetReg)
33767           .addReg(RegSaveReg);
33768     }
33769 
33770     // Compute the offset for the next argument
33771     Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
33772     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
33773       .addReg(OffsetReg)
33774       .addImm(UseFPOffset ? 16 : 8);
33775 
33776     // Store it back into the va_list.
33777     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
33778         .add(Base)
33779         .add(Scale)
33780         .add(Index)
33781         .addDisp(Disp, UseFPOffset ? 4 : 0)
33782         .add(Segment)
33783         .addReg(NextOffsetReg)
33784         .setMemRefs(StoreOnlyMMO);
33785 
33786     // Jump to endMBB
33787     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
33788       .addMBB(endMBB);
33789   }
33790 
33791   //
33792   // Emit code to use overflow area
33793   //
33794 
33795   // Load the overflow_area address into a register.
33796   Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
33797   BuildMI(overflowMBB, DL,
33798           TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
33799           OverflowAddrReg)
33800       .add(Base)
33801       .add(Scale)
33802       .add(Index)
33803       .addDisp(Disp, 8)
33804       .add(Segment)
33805       .setMemRefs(LoadOnlyMMO);
33806 
33807   // If we need to align it, do so. Otherwise, just copy the address
33808   // to OverflowDestReg.
33809   if (NeedsAlign) {
33810     // Align the overflow address
33811     Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
33812 
33813     // aligned_addr = (addr + (align-1)) & ~(align-1)
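          // e.g. with a 16-byte alignment: aligned_addr = (addr + 15) & ~15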
33814     BuildMI(
33815         overflowMBB, DL,
33816         TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
33817         TmpReg)
33818         .addReg(OverflowAddrReg)
33819         .addImm(Alignment.value() - 1);
33820 
33821     BuildMI(
33822         overflowMBB, DL,
33823         TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
33824         OverflowDestReg)
33825         .addReg(TmpReg)
33826         .addImm(~(uint64_t)(Alignment.value() - 1));
33827   } else {
33828     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
33829       .addReg(OverflowAddrReg);
33830   }
33831 
33832   // Compute the next overflow address after this argument.
33833   // (the overflow address should be kept 8-byte aligned)
33834   Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
33835   BuildMI(
33836       overflowMBB, DL,
33837       TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
33838       NextAddrReg)
33839       .addReg(OverflowDestReg)
33840       .addImm(ArgSizeA8);
33841 
33842   // Store the new overflow address.
33843   BuildMI(overflowMBB, DL,
33844           TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
33845       .add(Base)
33846       .add(Scale)
33847       .add(Index)
33848       .addDisp(Disp, 8)
33849       .add(Segment)
33850       .addReg(NextAddrReg)
33851       .setMemRefs(StoreOnlyMMO);
33852 
33853   // If we branched, emit the PHI to the front of endMBB.
33854   if (offsetMBB) {
33855     BuildMI(*endMBB, endMBB->begin(), DL,
33856             TII->get(X86::PHI), DestReg)
33857       .addReg(OffsetDestReg).addMBB(offsetMBB)
33858       .addReg(OverflowDestReg).addMBB(overflowMBB);
33859   }
33860 
33861   // Erase the pseudo instruction
33862   MI.eraseFromParent();
33863 
33864   return endMBB;
33865 }
33866 
33867 // The EFLAGS operand of SelectItr might be missing a kill marker
33868 // because there were multiple uses of EFLAGS, and ISel didn't know
33869 // which to mark. Figure out whether SelectItr should have had a
33870 // kill marker, and set it if it should. Returns the correct kill
33871 // marker value.
33872 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
33873                                      MachineBasicBlock* BB,
33874                                      const TargetRegisterInfo* TRI) {
33875   if (isEFLAGSLiveAfter(SelectItr, BB))
33876     return false;
33877 
33878   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
33879   // out. SelectMI should have a kill flag on EFLAGS.
33880   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
33881   return true;
33882 }
33883 
33884 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
33885 // together with other CMOV pseudo-opcodes into a single basic block with a
33886 // conditional jump around it.
33887 static bool isCMOVPseudo(MachineInstr &MI) {
33888   switch (MI.getOpcode()) {
33889   case X86::CMOV_FR16X:
33890   case X86::CMOV_FR32:
33891   case X86::CMOV_FR32X:
33892   case X86::CMOV_FR64:
33893   case X86::CMOV_FR64X:
33894   case X86::CMOV_GR8:
33895   case X86::CMOV_GR16:
33896   case X86::CMOV_GR32:
33897   case X86::CMOV_RFP32:
33898   case X86::CMOV_RFP64:
33899   case X86::CMOV_RFP80:
33900   case X86::CMOV_VR64:
33901   case X86::CMOV_VR128:
33902   case X86::CMOV_VR128X:
33903   case X86::CMOV_VR256:
33904   case X86::CMOV_VR256X:
33905   case X86::CMOV_VR512:
33906   case X86::CMOV_VK1:
33907   case X86::CMOV_VK2:
33908   case X86::CMOV_VK4:
33909   case X86::CMOV_VK8:
33910   case X86::CMOV_VK16:
33911   case X86::CMOV_VK32:
33912   case X86::CMOV_VK64:
33913     return true;
33914 
33915   default:
33916     return false;
33917   }
33918 }
33919 
33920 // Helper function, which inserts PHI functions into SinkMBB:
33921 //   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
33922 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
33923 // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for the
33924 // last PHI function inserted.
33925 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
33926     MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
33927     MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
33928     MachineBasicBlock *SinkMBB) {
33929   MachineFunction *MF = TrueMBB->getParent();
33930   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
33931   const DebugLoc &DL = MIItBegin->getDebugLoc();
33932 
33933   X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
33934   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
33935 
33936   MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
33937 
33938   // As we are creating the PHIs, we have to be careful if there is more than
33939   // one.  Later CMOVs may reference the results of earlier CMOVs, but later
33940   // PHIs have to reference the individual true/false inputs from earlier PHIs.
33941   // That also means that PHI construction must work forward from earlier to
33942   // later, and that the code must maintain a mapping from each earlier PHI's
33943   // destination register to the registers that went into that PHI.
33944   DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
33945   MachineInstrBuilder MIB;
33946 
33947   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
33948     Register DestReg = MIIt->getOperand(0).getReg();
33949     Register Op1Reg = MIIt->getOperand(1).getReg();
33950     Register Op2Reg = MIIt->getOperand(2).getReg();
33951 
33952     // If the CMOV we are generating has the opposite condition from
33953     // the jump we generated, then we have to swap the operands for the
33954     // PHI that is going to be generated.
33955     if (MIIt->getOperand(3).getImm() == OppCC)
33956       std::swap(Op1Reg, Op2Reg);
33957 
33958     if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
33959       Op1Reg = RegRewriteTable[Op1Reg].first;
33960 
33961     if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
33962       Op2Reg = RegRewriteTable[Op2Reg].second;
33963 
33964     MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
33965               .addReg(Op1Reg)
33966               .addMBB(FalseMBB)
33967               .addReg(Op2Reg)
33968               .addMBB(TrueMBB);
33969 
33970     // Add this PHI to the rewrite table.
33971     RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
33972   }
33973 
33974   return MIB;
33975 }
33976 
33977 // Lower cascaded selects of the form (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
33978 MachineBasicBlock *
33979 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
33980                                              MachineInstr &SecondCascadedCMOV,
33981                                              MachineBasicBlock *ThisMBB) const {
33982   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33983   const DebugLoc &DL = FirstCMOV.getDebugLoc();
33984 
33985   // We lower cascaded CMOVs such as
33986   //
33987   //   (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
33988   //
33989   // to two successive branches.
33990   //
33991   // Without this, we would add a PHI between the two jumps, which ends up
33992   // creating a few copies all around. For instance, for
33993   //
33994   //    (sitofp (zext (fcmp une)))
33995   //
33996   // we would generate:
33997   //
33998   //         ucomiss %xmm1, %xmm0
33999   //         movss  <1.0f>, %xmm0
34000   //         movaps  %xmm0, %xmm1
34001   //         jne     .LBB5_2
34002   //         xorps   %xmm1, %xmm1
34003   // .LBB5_2:
34004   //         jp      .LBB5_4
34005   //         movaps  %xmm1, %xmm0
34006   // .LBB5_4:
34007   //         retq
34008   //
34009   // because this custom-inserter would have generated:
34010   //
34011   //   A
34012   //   | \
34013   //   |  B
34014   //   | /
34015   //   C
34016   //   | \
34017   //   |  D
34018   //   | /
34019   //   E
34020   //
34021   // A: X = ...; Y = ...
34022   // B: empty
34023   // C: Z = PHI [X, A], [Y, B]
34024   // D: empty
34025   // E: PHI [X, C], [Z, D]
34026   //
34027   // If we lower both CMOVs in a single step, we can instead generate:
34028   //
34029   //   A
34030   //   | \
34031   //   |  C
34032   //   | /|
34033   //   |/ |
34034   //   |  |
34035   //   |  D
34036   //   | /
34037   //   E
34038   //
34039   // A: X = ...; Y = ...
34040   // D: empty
34041   // E: PHI [X, A], [X, C], [Y, D]
34042   //
34043   // Which, in our sitofp/fcmp example, gives us something like:
34044   //
34045   //         ucomiss %xmm1, %xmm0
34046   //         movss  <1.0f>, %xmm0
34047   //         jne     .LBB5_4
34048   //         jp      .LBB5_4
34049   //         xorps   %xmm0, %xmm0
34050   // .LBB5_4:
34051   //         retq
34052   //
34053 
34054   // We lower the cascaded CMOV into two successive branches to the same block.
34055   // EFLAGS is used by both branches.
34056   const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34057   MachineFunction *F = ThisMBB->getParent();
34058   MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34059   MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34060   MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34061 
34062   MachineFunction::iterator It = ++ThisMBB->getIterator();
34063   F->insert(It, FirstInsertedMBB);
34064   F->insert(It, SecondInsertedMBB);
34065   F->insert(It, SinkMBB);
34066 
34067   // For a cascaded CMOV, we lower it to two successive branches to
34068   // the same block (SinkMBB).  EFLAGS is used by both, so mark it as live in
34069   // the FirstInsertedMBB.
34070   FirstInsertedMBB->addLiveIn(X86::EFLAGS);
34071 
34072   // If the EFLAGS register isn't dead in the terminator, then claim that it's
34073   // live into the sink and copy blocks.
34074   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34075   if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
34076       !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
34077     SecondInsertedMBB->addLiveIn(X86::EFLAGS);
34078     SinkMBB->addLiveIn(X86::EFLAGS);
34079   }
34080 
34081   // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34082   SinkMBB->splice(SinkMBB->begin(), ThisMBB,
34083                   std::next(MachineBasicBlock::iterator(FirstCMOV)),
34084                   ThisMBB->end());
34085   SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34086 
34087   // Fallthrough block for ThisMBB.
34088   ThisMBB->addSuccessor(FirstInsertedMBB);
34089   // The true block target of the first branch is always SinkMBB.
34090   ThisMBB->addSuccessor(SinkMBB);
34091   // Fallthrough block for FirstInsertedMBB.
34092   FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
34093   // The true block for the branch of FirstInsertedMBB.
34094   FirstInsertedMBB->addSuccessor(SinkMBB);
34095   // This is fallthrough.
34096   SecondInsertedMBB->addSuccessor(SinkMBB);
34097 
34098   // Create the conditional branch instructions.
34099   X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
34100   BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
34101 
34102   X86::CondCode SecondCC =
34103       X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
34104   BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
34105 
34106   //  SinkMBB:
34107   //   %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
34108   Register DestReg = FirstCMOV.getOperand(0).getReg();
34109   Register Op1Reg = FirstCMOV.getOperand(1).getReg();
34110   Register Op2Reg = FirstCMOV.getOperand(2).getReg();
34111   MachineInstrBuilder MIB =
34112       BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
34113           .addReg(Op1Reg)
34114           .addMBB(SecondInsertedMBB)
34115           .addReg(Op2Reg)
34116           .addMBB(ThisMBB);
34117 
34118   // The edge from FirstInsertedMBB carries the same incoming value as the edge
34119   // from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
34120   MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
34121   // Copy the PHI result to the register defined by the second CMOV.
34122   BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
34123           TII->get(TargetOpcode::COPY),
34124           SecondCascadedCMOV.getOperand(0).getReg())
34125       .addReg(FirstCMOV.getOperand(0).getReg());
34126 
34127   // Now remove the CMOVs.
34128   FirstCMOV.eraseFromParent();
34129   SecondCascadedCMOV.eraseFromParent();
34130 
34131   return SinkMBB;
34132 }
34133 
34134 MachineBasicBlock *
34135 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
34136                                      MachineBasicBlock *ThisMBB) const {
34137   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34138   const DebugLoc &DL = MI.getDebugLoc();
34139 
34140   // To "insert" a SELECT_CC instruction, we actually have to insert the
34141   // diamond control-flow pattern.  The incoming instruction knows the
34142   // destination vreg to set, the condition code register to branch on, the
34143   // true/false values to select between and a branch opcode to use.
34144 
34145   //  ThisMBB:
34146   //  ...
34147   //   TrueVal = ...
34148   //   cmpTY ccX, r1, r2
34149   //   bCC copy1MBB
34150   //   fallthrough --> FalseMBB
34151 
34152   // This code lowers all pseudo-CMOV instructions. Generally it lowers these
34153   // as described above, by inserting a BB, and then making a PHI at the join
34154   // point to select the true and false operands of the CMOV in the PHI.
34155   //
34156   // The code also handles two different cases of multiple CMOV opcodes
34157   // in a row.
34158   //
34159   // Case 1:
34160   // In this case, there are multiple CMOVs in a row, all of which are based
34161   // on the same condition setting (or the exact opposite condition setting).
34162   // In this case we can lower all the CMOVs using a single inserted BB, and
34163   // then make a number of PHIs at the join point to model the CMOVs. The only
34164   // trickiness here is that in a case like:
34165   //
34166   // t2 = CMOV cond1 t1, f1
34167   // t3 = CMOV cond1 t2, f2
34168   //
34169   // when rewriting this into PHIs, we have to perform some renaming on the
34170   // temps since you cannot have a PHI operand refer to a PHI result earlier
34171   // in the same block.  The "simple" but wrong lowering would be:
34172   //
34173   // t2 = PHI t1(BB1), f1(BB2)
34174   // t3 = PHI t2(BB1), f2(BB2)
34175   //
34176   // but clearly t2 is not defined in BB1, so that is incorrect. The proper
34177   // renaming is to note that on the path through BB1, t2 is really just a
34178   // copy of t1, and do that renaming, properly generating:
34179   //
34180   // t2 = PHI t1(BB1), f1(BB2)
34181   // t3 = PHI t1(BB1), f2(BB2)
34182   //
34183   // Case 2:
34184   // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
34185   // function - EmitLoweredCascadedSelect.
34186 
34187   X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
34188   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34189   MachineInstr *LastCMOV = &MI;
34190   MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
34191 
  // First, check for case 1, where there are multiple CMOVs with the same
  // condition.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
  // number of jumps the most.
34195 
34196   if (isCMOVPseudo(MI)) {
    // See if we have a string of CMOVs with the same condition, skipping over
    // any intervening debug instructions.
34199     while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
34200            (NextMIIt->getOperand(3).getImm() == CC ||
34201             NextMIIt->getOperand(3).getImm() == OppCC)) {
34202       LastCMOV = &*NextMIIt;
34203       NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
34204     }
34205   }
34206 
  // Check for case 2, but only if we didn't already find case 1, as indicated
  // by LastCMOV still pointing at MI.
34209   if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
34210       NextMIIt->getOpcode() == MI.getOpcode() &&
34211       NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
34212       NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
34213       NextMIIt->getOperand(1).isKill()) {
34214     return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
34215   }
34216 
34217   const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34218   MachineFunction *F = ThisMBB->getParent();
34219   MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
34220   MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34221 
34222   MachineFunction::iterator It = ++ThisMBB->getIterator();
34223   F->insert(It, FalseMBB);
34224   F->insert(It, SinkMBB);
34225 
34226   // If the EFLAGS register isn't dead in the terminator, then claim that it's
34227   // live into the sink and copy blocks.
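  // (EFLAGS is a physical register that will now be live across the new block
  // boundaries, and the new blocks must list it as a live-in to satisfy the
  // machine verifier.)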
34228   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34229   if (!LastCMOV->killsRegister(X86::EFLAGS) &&
34230       !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
34231     FalseMBB->addLiveIn(X86::EFLAGS);
34232     SinkMBB->addLiveIn(X86::EFLAGS);
34233   }
34234 
34235   // Transfer any debug instructions inside the CMOV sequence to the sunk block.
34236   auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
34237                                    MachineBasicBlock::iterator(LastCMOV));
34238   for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
34239     if (MI.isDebugInstr())
34240       SinkMBB->push_back(MI.removeFromParent());
34241 
34242   // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34243   SinkMBB->splice(SinkMBB->end(), ThisMBB,
34244                   std::next(MachineBasicBlock::iterator(LastCMOV)),
34245                   ThisMBB->end());
34246   SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34247 
34248   // Fallthrough block for ThisMBB.
34249   ThisMBB->addSuccessor(FalseMBB);
  // The true block target of the first (or only) branch is always SinkMBB.
34251   ThisMBB->addSuccessor(SinkMBB);
34252   // Fallthrough block for FalseMBB.
34253   FalseMBB->addSuccessor(SinkMBB);
34254 
34255   // Create the conditional branch instruction.
34256   BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
34257 
34258   //  SinkMBB:
34259   //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
34260   //  ...
34261   MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
34262   MachineBasicBlock::iterator MIItEnd =
34263       std::next(MachineBasicBlock::iterator(LastCMOV));
34264   createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
34265 
34266   // Now remove the CMOV(s).
34267   ThisMBB->erase(MIItBegin, MIItEnd);
34268 
34269   return SinkMBB;
34270 }
34271 
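// Return the SUB-with-immediate opcode for the given stack pointer width,
// preferring the 8-bit immediate form when Imm fits in a signed byte.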
34272 static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
34273   if (IsLP64) {
34274     if (isInt<8>(Imm))
34275       return X86::SUB64ri8;
34276     return X86::SUB64ri32;
34277   } else {
34278     if (isInt<8>(Imm))
34279       return X86::SUB32ri8;
34280     return X86::SUB32ri;
34281   }
34282 }
34283 
34284 MachineBasicBlock *
34285 X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
34286                                            MachineBasicBlock *MBB) const {
34287   MachineFunction *MF = MBB->getParent();
34288   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34289   const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
34290   const DebugLoc &DL = MI.getDebugLoc();
34291   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
34292 
34293   const unsigned ProbeSize = getStackProbeSize(*MF);
34294 
34295   MachineRegisterInfo &MRI = MF->getRegInfo();
34296   MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34297   MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34298   MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34299 
34300   MachineFunction::iterator MBBIter = ++MBB->getIterator();
34301   MF->insert(MBBIter, testMBB);
34302   MF->insert(MBBIter, blockMBB);
34303   MF->insert(MBBIter, tailMBB);
34304 
34305   Register sizeVReg = MI.getOperand(1).getReg();
34306 
34307   Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
34308 
34309   Register TmpStackPtr = MRI.createVirtualRegister(
34310       TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
34311   Register FinalStackPtr = MRI.createVirtualRegister(
34312       TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
34313 
34314   BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
34315       .addReg(physSPReg);
34316   {
34317     const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
34318     BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
34319         .addReg(TmpStackPtr)
34320         .addReg(sizeVReg);
34321   }
34322 
  // testMBB: compare the final stack pointer against the current stack pointer.
34324 
34325   BuildMI(testMBB, DL,
34326           TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
34327       .addReg(FinalStackPtr)
34328       .addReg(physSPReg);
34329 
34330   BuildMI(testMBB, DL, TII->get(X86::JCC_1))
34331       .addMBB(tailMBB)
34332       .addImm(X86::COND_GE);
34333   testMBB->addSuccessor(blockMBB);
34334   testMBB->addSuccessor(tailMBB);
34335 
  // Touch the current block, then extend the stack. This is the opposite order
  // from static probing, where we allocate and then touch; it avoids having to
  // probe the tail of the static alloca. Possible scenarios are:
34339   //
34340   //       + ---- <- ------------ <- ------------- <- ------------ +
34341   //       |                                                       |
34342   // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
34343   //                                                               |                                                               |
34344   //                                                               + <- ----------- <- ------------ <- ----------- <- ------------ +
34345   //
34346   // The property we want to enforce is to never have more than [page alloc] between two probes.
34347 
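  // Touch the current top of stack with a dummy read-modify-write: XORing 0
  // into [SP] leaves memory unchanged but forces the page to be mapped, which
  // is all a stack probe needs to do.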
34348   const unsigned XORMIOpc =
34349       TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
34350   addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
34351       .addImm(0);
34352 
34353   BuildMI(blockMBB, DL,
34354           TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
34355       .addReg(physSPReg)
34356       .addImm(ProbeSize);
34357 
34358 
34359   BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
34360   blockMBB->addSuccessor(testMBB);
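  // Together, testMBB and blockMBB form a loop that keeps touching the current
  // top of stack and moving the stack pointer down by ProbeSize until it
  // reaches (or moves past) FinalStackPtr, so no more than ProbeSize bytes are
  // ever left unprobed.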
34361 
  // Replace the original instruction's result with the expected stack pointer.
34363   BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
34364       .addReg(FinalStackPtr);
34365 
34366   tailMBB->splice(tailMBB->end(), MBB,
34367                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
34368   tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
34369   MBB->addSuccessor(testMBB);
34370 
34371   // Delete the original pseudo instruction.
34372   MI.eraseFromParent();
34373 
34374   // And we're done.
34375   return tailMBB;
34376 }
34377 
34378 MachineBasicBlock *
34379 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
34380                                         MachineBasicBlock *BB) const {
34381   MachineFunction *MF = BB->getParent();
34382   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34383   const DebugLoc &DL = MI.getDebugLoc();
34384   const BasicBlock *LLVM_BB = BB->getBasicBlock();
34385 
34386   assert(MF->shouldSplitStack());
34387 
34388   const bool Is64Bit = Subtarget.is64Bit();
34389   const bool IsLP64 = Subtarget.isTarget64BitLP64();
34390 
34391   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
34392   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
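  // TlsOffset is the offset of the per-thread stack limit used for segmented
  // stacks, addressed through the FS/GS segment; the compare emitted below
  // checks the prospective stack pointer against this limit.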
34393 
34394   // BB:
34395   //  ... [Till the alloca]
34396   // If stacklet is not large enough, jump to mallocMBB
34397   //
34398   // bumpMBB:
34399   //  Allocate by subtracting from RSP
34400   //  Jump to continueMBB
34401   //
34402   // mallocMBB:
34403   //  Allocate by call to runtime
34404   //
34405   // continueMBB:
34406   //  ...
34407   //  [rest of original BB]
34408   //
34409 
34410   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34411   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34412   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34413 
34414   MachineRegisterInfo &MRI = MF->getRegInfo();
34415   const TargetRegisterClass *AddrRegClass =
34416       getRegClassFor(getPointerTy(MF->getDataLayout()));
34417 
34418   Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
34419            bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
34420            tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
34421            SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
34422            sizeVReg = MI.getOperand(1).getReg(),
34423            physSPReg =
34424                IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
34425 
34426   MachineFunction::iterator MBBIter = ++BB->getIterator();
34427 
34428   MF->insert(MBBIter, bumpMBB);
34429   MF->insert(MBBIter, mallocMBB);
34430   MF->insert(MBBIter, continueMBB);
34431 
34432   continueMBB->splice(continueMBB->begin(), BB,
34433                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
34434   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
34435 
  // Add code to the main basic block to check whether the stack limit has been
  // hit; if so, jump to mallocMBB, otherwise fall through to bumpMBB.
34438   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
34439   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
34440     .addReg(tmpSPVReg).addReg(sizeVReg);
34441   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
34442     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
34443     .addReg(SPLimitVReg);
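  // The five memory operands of the CMP above encode [base=0, scale=1,
  // index=0, disp=TlsOffset, segment=TlsReg], i.e. a %fs:/%gs:-relative access
  // to the stack limit.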
34444   BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
34445 
34446   // bumpMBB simply decreases the stack pointer, since we know the current
34447   // stacklet has enough space.
34448   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
34449     .addReg(SPLimitVReg);
34450   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
34451     .addReg(SPLimitVReg);
34452   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
34453 
  // mallocMBB calls into a libgcc routine to allocate more space from the heap.
34455   const uint32_t *RegMask =
34456       Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
34457   if (IsLP64) {
34458     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
34459       .addReg(sizeVReg);
34460     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
34461       .addExternalSymbol("__morestack_allocate_stack_space")
34462       .addRegMask(RegMask)
34463       .addReg(X86::RDI, RegState::Implicit)
34464       .addReg(X86::RAX, RegState::ImplicitDefine);
34465   } else if (Is64Bit) {
34466     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
34467       .addReg(sizeVReg);
34468     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
34469       .addExternalSymbol("__morestack_allocate_stack_space")
34470       .addRegMask(RegMask)
34471       .addReg(X86::EDI, RegState::Implicit)
34472       .addReg(X86::EAX, RegState::ImplicitDefine);
34473   } else {
34474     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
34475       .addImm(12);
34476     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
34477     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
34478       .addExternalSymbol("__morestack_allocate_stack_space")
34479       .addRegMask(RegMask)
34480       .addReg(X86::EAX, RegState::ImplicitDefine);
34481   }
34482 
34483   if (!Is64Bit)
34484     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
34485       .addImm(16);
34486 
34487   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
34488     .addReg(IsLP64 ? X86::RAX : X86::EAX);
34489   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
34490 
34491   // Set up the CFG correctly.
34492   BB->addSuccessor(bumpMBB);
34493   BB->addSuccessor(mallocMBB);
34494   mallocMBB->addSuccessor(continueMBB);
34495   bumpMBB->addSuccessor(continueMBB);
34496 
34497   // Take care of the PHI nodes.
34498   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
34499           MI.getOperand(0).getReg())
34500       .addReg(mallocPtrVReg)
34501       .addMBB(mallocMBB)
34502       .addReg(bumpSPPtrVReg)
34503       .addMBB(bumpMBB);
34504 
34505   // Delete the original pseudo instruction.
34506   MI.eraseFromParent();
34507 
34508   // And we're done.
34509   return continueMBB;
34510 }
34511 
34512 MachineBasicBlock *
34513 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
34514                                        MachineBasicBlock *BB) const {
34515   MachineFunction *MF = BB->getParent();
34516   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
34517   MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
34518   const DebugLoc &DL = MI.getDebugLoc();
34519 
34520   assert(!isAsynchronousEHPersonality(
34521              classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
34522          "SEH does not use catchret!");
34523 
34524   // Only 32-bit EH needs to worry about manually restoring stack pointers.
34525   if (!Subtarget.is32Bit())
34526     return BB;
34527 
34528   // C++ EH creates a new target block to hold the restore code, and wires up
34529   // the new block to the return destination with a normal JMP_4.
34530   MachineBasicBlock *RestoreMBB =
34531       MF->CreateMachineBasicBlock(BB->getBasicBlock());
34532   assert(BB->succ_size() == 1);
34533   MF->insert(std::next(BB->getIterator()), RestoreMBB);
34534   RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
34535   BB->addSuccessor(RestoreMBB);
34536   MI.getOperand(0).setMBB(RestoreMBB);
34537 
34538   // Marking this as an EH pad but not a funclet entry block causes PEI to
34539   // restore stack pointers in the block.
34540   RestoreMBB->setIsEHPad(true);
34541 
34542   auto RestoreMBBI = RestoreMBB->begin();
34543   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
34544   return BB;
34545 }
34546 
34547 MachineBasicBlock *
34548 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
34549                                       MachineBasicBlock *BB) const {
  // Here we replace TLSADDR with the sequence:
  //   adjust_stackdown -> TLSADDR -> adjust_stackup.
  // We need this because TLSADDR is lowered into a call inside MC, so without
  // the two markers shrink-wrapping may push the prologue/epilogue past them.
34555   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
34556   const DebugLoc &DL = MI.getDebugLoc();
34557   MachineFunction &MF = *BB->getParent();
34558 
34559   // Emit CALLSEQ_START right before the instruction.
34560   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
34561   MachineInstrBuilder CallseqStart =
34562     BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
34563   BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
34564 
34565   // Emit CALLSEQ_END right after the instruction.
34566   // We don't call erase from parent because we want to keep the
34567   // original instruction around.
34568   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
34569   MachineInstrBuilder CallseqEnd =
34570     BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
34571   BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
34572 
34573   return BB;
34574 }
34575 
34576 MachineBasicBlock *
34577 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
34578                                       MachineBasicBlock *BB) const {
34579   // This is pretty easy.  We're taking the value that we received from
34580   // our load from the relocation, sticking it in either RDI (x86-64)
34581   // or EAX and doing an indirect call.  The return value will then
34582   // be in the normal return register.
34583   MachineFunction *F = BB->getParent();
34584   const X86InstrInfo *TII = Subtarget.getInstrInfo();
34585   const DebugLoc &DL = MI.getDebugLoc();
34586 
34587   assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
34588   assert(MI.getOperand(3).isGlobal() && "This should be a global");
34589 
34590   // Get a register mask for the lowered call.
34591   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
34592   // proper register mask.
34593   const uint32_t *RegMask =
34594       Subtarget.is64Bit() ?
34595       Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
34596       Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
34597   if (Subtarget.is64Bit()) {
34598     MachineInstrBuilder MIB =
34599         BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
34600             .addReg(X86::RIP)
34601             .addImm(0)
34602             .addReg(0)
34603             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34604                               MI.getOperand(3).getTargetFlags())
34605             .addReg(0);
34606     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
34607     addDirectMem(MIB, X86::RDI);
34608     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
34609   } else if (!isPositionIndependent()) {
34610     MachineInstrBuilder MIB =
34611         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
34612             .addReg(0)
34613             .addImm(0)
34614             .addReg(0)
34615             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34616                               MI.getOperand(3).getTargetFlags())
34617             .addReg(0);
34618     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
34619     addDirectMem(MIB, X86::EAX);
34620     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
34621   } else {
34622     MachineInstrBuilder MIB =
34623         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
34624             .addReg(TII->getGlobalBaseReg(F))
34625             .addImm(0)
34626             .addReg(0)
34627             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34628                               MI.getOperand(3).getTargetFlags())
34629             .addReg(0);
34630     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
34631     addDirectMem(MIB, X86::EAX);
34632     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
34633   }
34634 
34635   MI.eraseFromParent(); // The pseudo instruction is gone now.
34636   return BB;
34637 }
34638 
34639 static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
34640   switch (RPOpc) {
34641   case X86::INDIRECT_THUNK_CALL32:
34642     return X86::CALLpcrel32;
34643   case X86::INDIRECT_THUNK_CALL64:
34644     return X86::CALL64pcrel32;
34645   case X86::INDIRECT_THUNK_TCRETURN32:
34646     return X86::TCRETURNdi;
34647   case X86::INDIRECT_THUNK_TCRETURN64:
34648     return X86::TCRETURNdi64;
34649   }
34650   llvm_unreachable("not indirect thunk opcode");
34651 }
34652 
34653 static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
34654                                           unsigned Reg) {
34655   if (Subtarget.useRetpolineExternalThunk()) {
34656     // When using an external thunk for retpolines, we pick names that match the
34657     // names GCC happens to use as well. This helps simplify the implementation
34658     // of the thunks for kernels where they have no easy ability to create
34659     // aliases and are doing non-trivial configuration of the thunk's body. For
34660     // example, the Linux kernel will do boot-time hot patching of the thunk
34661     // bodies and cannot easily export aliases of these to loaded modules.
34662     //
34663     // Note that at any point in the future, we may need to change the semantics
34664     // of how we implement retpolines and at that time will likely change the
34665     // name of the called thunk. Essentially, there is no hard guarantee that
34666     // LLVM will generate calls to specific thunks, we merely make a best-effort
34667     // attempt to help out kernels and other systems where duplicating the
34668     // thunks is costly.
34669     switch (Reg) {
34670     case X86::EAX:
34671       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34672       return "__x86_indirect_thunk_eax";
34673     case X86::ECX:
34674       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34675       return "__x86_indirect_thunk_ecx";
34676     case X86::EDX:
34677       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34678       return "__x86_indirect_thunk_edx";
34679     case X86::EDI:
34680       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34681       return "__x86_indirect_thunk_edi";
34682     case X86::R11:
34683       assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34684       return "__x86_indirect_thunk_r11";
34685     }
34686     llvm_unreachable("unexpected reg for external indirect thunk");
34687   }
34688 
34689   if (Subtarget.useRetpolineIndirectCalls() ||
34690       Subtarget.useRetpolineIndirectBranches()) {
34691     // When targeting an internal COMDAT thunk use an LLVM-specific name.
34692     switch (Reg) {
34693     case X86::EAX:
34694       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34695       return "__llvm_retpoline_eax";
34696     case X86::ECX:
34697       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34698       return "__llvm_retpoline_ecx";
34699     case X86::EDX:
34700       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34701       return "__llvm_retpoline_edx";
34702     case X86::EDI:
34703       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34704       return "__llvm_retpoline_edi";
34705     case X86::R11:
34706       assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34707       return "__llvm_retpoline_r11";
34708     }
34709     llvm_unreachable("unexpected reg for retpoline");
34710   }
34711 
34712   if (Subtarget.useLVIControlFlowIntegrity()) {
34713     assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34714     return "__llvm_lvi_thunk_r11";
34715   }
34716   llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
34717 }
34718 
34719 MachineBasicBlock *
34720 X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
34721                                             MachineBasicBlock *BB) const {
  // Copy the virtual register holding the callee into a scratch physical
  // register (R11 on 64-bit targets) and call the indirect thunk.
34724   const DebugLoc &DL = MI.getDebugLoc();
34725   const X86InstrInfo *TII = Subtarget.getInstrInfo();
34726   Register CalleeVReg = MI.getOperand(0).getReg();
34727   unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
34728 
34729   // Find an available scratch register to hold the callee. On 64-bit, we can
34730   // just use R11, but we scan for uses anyway to ensure we don't generate
34731   // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
34732   // already a register use operand to the call to hold the callee. If none
34733   // are available, use EDI instead. EDI is chosen because EBX is the PIC base
34734   // register and ESI is the base pointer to realigned stack frames with VLAs.
34735   SmallVector<unsigned, 3> AvailableRegs;
34736   if (Subtarget.is64Bit())
34737     AvailableRegs.push_back(X86::R11);
34738   else
34739     AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
34740 
34741   // Zero out any registers that are already used.
34742   for (const auto &MO : MI.operands()) {
34743     if (MO.isReg() && MO.isUse())
34744       for (unsigned &Reg : AvailableRegs)
34745         if (Reg == MO.getReg())
34746           Reg = 0;
34747   }
34748 
34749   // Choose the first remaining non-zero available register.
34750   unsigned AvailableReg = 0;
34751   for (unsigned MaybeReg : AvailableRegs) {
34752     if (MaybeReg) {
34753       AvailableReg = MaybeReg;
34754       break;
34755     }
34756   }
34757   if (!AvailableReg)
34758     report_fatal_error("calling convention incompatible with retpoline, no "
34759                        "available registers");
34760 
34761   const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
34762 
34763   BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
34764       .addReg(CalleeVReg);
34765   MI.getOperand(0).ChangeToES(Symbol);
34766   MI.setDesc(TII->get(Opc));
34767   MachineInstrBuilder(*BB->getParent(), &MI)
34768       .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
34769   return BB;
34770 }
34771 
/// A SetJmp implies a future control-flow change when the corresponding
/// LongJmp is called.
/// Instead of using the 'return' instruction, the long jump fixes the stack and
/// performs an indirect branch. To do so it uses the registers that were stored
/// in the jump buffer (when calling SetJmp).
/// If the shadow stack is enabled we need to fix it as well, because some
/// return addresses will be skipped.
/// This function saves the SSP so that emitLongJmpShadowStackFix can fix the
/// shadow stack later.
34781 /// \sa emitLongJmpShadowStackFix
34782 /// \param [in] MI The temporary Machine Instruction for the builtin.
34783 /// \param [in] MBB The Machine Basic Block that will be modified.
34784 void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
34785                                                  MachineBasicBlock *MBB) const {
34786   const DebugLoc &DL = MI.getDebugLoc();
34787   MachineFunction *MF = MBB->getParent();
34788   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34789   MachineRegisterInfo &MRI = MF->getRegInfo();
34790   MachineInstrBuilder MIB;
34791 
34792   // Memory Reference.
34793   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
34794                                            MI.memoperands_end());
34795 
34796   // Initialize a register with zero.
34797   MVT PVT = getPointerTy(MF->getDataLayout());
34798   const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
34799   Register ZReg = MRI.createVirtualRegister(PtrRC);
34800   unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
34801   BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
34802       .addDef(ZReg)
34803       .addReg(ZReg, RegState::Undef)
34804       .addReg(ZReg, RegState::Undef);
34805 
  // Read the current SSP register value into the zeroed register.
34807   Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
34808   unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
34809   BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
34810 
  // Write the SSP register value to slot 3 of the input memory buffer.
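  // The jump buffer layout used by this lowering is: slot 0 = frame pointer,
  // slot 1 = resume address, slot 2 = stack pointer, slot 3 = shadow stack
  // pointer (see also emitEHSjLjSetJmp and emitEHSjLjLongJmp).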
34812   unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
34813   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
34814   const int64_t SSPOffset = 3 * PVT.getStoreSize();
34815   const unsigned MemOpndSlot = 1;
34816   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
34817     if (i == X86::AddrDisp)
34818       MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
34819     else
34820       MIB.add(MI.getOperand(MemOpndSlot + i));
34821   }
34822   MIB.addReg(SSPCopyReg);
34823   MIB.setMemRefs(MMOs);
34824 }
34825 
34826 MachineBasicBlock *
34827 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
34828                                     MachineBasicBlock *MBB) const {
34829   const DebugLoc &DL = MI.getDebugLoc();
34830   MachineFunction *MF = MBB->getParent();
34831   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34832   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34833   MachineRegisterInfo &MRI = MF->getRegInfo();
34834 
34835   const BasicBlock *BB = MBB->getBasicBlock();
34836   MachineFunction::iterator I = ++MBB->getIterator();
34837 
34838   // Memory Reference
34839   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
34840                                            MI.memoperands_end());
34841 
34842   unsigned DstReg;
34843   unsigned MemOpndSlot = 0;
34844 
34845   unsigned CurOp = 0;
34846 
34847   DstReg = MI.getOperand(CurOp++).getReg();
34848   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
34849   assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
34850   (void)TRI;
34851   Register mainDstReg = MRI.createVirtualRegister(RC);
34852   Register restoreDstReg = MRI.createVirtualRegister(RC);
34853 
34854   MemOpndSlot = CurOp;
34855 
34856   MVT PVT = getPointerTy(MF->getDataLayout());
34857   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
34858          "Invalid Pointer Size!");
34859 
34860   // For v = setjmp(buf), we generate
34861   //
34862   // thisMBB:
34863   //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
34864   //  SjLjSetup restoreMBB
34865   //
34866   // mainMBB:
34867   //  v_main = 0
34868   //
34869   // sinkMBB:
34870   //  v = phi(main, restore)
34871   //
34872   // restoreMBB:
34873   //  if base pointer being used, load it from frame
34874   //  v_restore = 1
34875 
34876   MachineBasicBlock *thisMBB = MBB;
34877   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
34878   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
34879   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
34880   MF->insert(I, mainMBB);
34881   MF->insert(I, sinkMBB);
34882   MF->push_back(restoreMBB);
34883   restoreMBB->setHasAddressTaken();
34884 
34885   MachineInstrBuilder MIB;
34886 
34887   // Transfer the remainder of BB and its successor edges to sinkMBB.
34888   sinkMBB->splice(sinkMBB->begin(), MBB,
34889                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
34890   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
34891 
34892   // thisMBB:
34893   unsigned PtrStoreOpc = 0;
34894   unsigned LabelReg = 0;
34895   const int64_t LabelOffset = 1 * PVT.getStoreSize();
34896   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
34897                      !isPositionIndependent();
34898 
34899   // Prepare IP either in reg or imm.
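  // With the small code model and no PIC, the address of restoreMBB can be
  // encoded directly as an immediate in the store; otherwise it is first
  // materialized into a register with an LEA (RIP-relative on 64-bit, or
  // relative to the PIC base register on 32-bit).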
34900   if (!UseImmLabel) {
34901     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
34902     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
34903     LabelReg = MRI.createVirtualRegister(PtrRC);
34904     if (Subtarget.is64Bit()) {
34905       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
34906               .addReg(X86::RIP)
34907               .addImm(0)
34908               .addReg(0)
34909               .addMBB(restoreMBB)
34910               .addReg(0);
34911     } else {
34912       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
34913       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
34914               .addReg(XII->getGlobalBaseReg(MF))
34915               .addImm(0)
34916               .addReg(0)
34917               .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
34918               .addReg(0);
34919     }
34920   } else
34921     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
34922   // Store IP
34923   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
34924   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
34925     if (i == X86::AddrDisp)
34926       MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
34927     else
34928       MIB.add(MI.getOperand(MemOpndSlot + i));
34929   }
34930   if (!UseImmLabel)
34931     MIB.addReg(LabelReg);
34932   else
34933     MIB.addMBB(restoreMBB);
34934   MIB.setMemRefs(MMOs);
34935 
34936   if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
34937     emitSetJmpShadowStackFix(MI, thisMBB);
34938   }
34939 
34940   // Setup
34941   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
34942           .addMBB(restoreMBB);
34943 
34944   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
34945   MIB.addRegMask(RegInfo->getNoPreservedMask());
34946   thisMBB->addSuccessor(mainMBB);
34947   thisMBB->addSuccessor(restoreMBB);
34948 
34949   // mainMBB:
34950   //  EAX = 0
34951   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
34952   mainMBB->addSuccessor(sinkMBB);
34953 
34954   // sinkMBB:
34955   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
34956           TII->get(X86::PHI), DstReg)
34957     .addReg(mainDstReg).addMBB(mainMBB)
34958     .addReg(restoreDstReg).addMBB(restoreMBB);
34959 
34960   // restoreMBB:
34961   if (RegInfo->hasBasePointer(*MF)) {
34962     const bool Uses64BitFramePtr =
34963         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
34964     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
34965     X86FI->setRestoreBasePointer(MF);
34966     Register FramePtr = RegInfo->getFrameRegister(*MF);
34967     Register BasePtr = RegInfo->getBaseRegister();
34968     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
34969     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
34970                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
34971       .setMIFlag(MachineInstr::FrameSetup);
34972   }
34973   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
34974   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
34975   restoreMBB->addSuccessor(sinkMBB);
34976 
34977   MI.eraseFromParent();
34978   return sinkMBB;
34979 }
34980 
34981 /// Fix the shadow stack using the previously saved SSP pointer.
34982 /// \sa emitSetJmpShadowStackFix
34983 /// \param [in] MI The temporary Machine Instruction for the builtin.
34984 /// \param [in] MBB The Machine Basic Block that will be modified.
34985 /// \return The sink MBB that will perform the future indirect branch.
34986 MachineBasicBlock *
34987 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
34988                                              MachineBasicBlock *MBB) const {
34989   const DebugLoc &DL = MI.getDebugLoc();
34990   MachineFunction *MF = MBB->getParent();
34991   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34992   MachineRegisterInfo &MRI = MF->getRegInfo();
34993 
34994   // Memory Reference
34995   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
34996                                            MI.memoperands_end());
34997 
34998   MVT PVT = getPointerTy(MF->getDataLayout());
34999   const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35000 
35001   // checkSspMBB:
35002   //         xor vreg1, vreg1
35003   //         rdssp vreg1
35004   //         test vreg1, vreg1
35005   //         je sinkMBB   # Jump if Shadow Stack is not supported
35006   // fallMBB:
35007   //         mov buf+24/12(%rip), vreg2
35008   //         sub vreg1, vreg2
35009   //         jbe sinkMBB  # No need to fix the Shadow Stack
35010   // fixShadowMBB:
35011   //         shr 3/2, vreg2
35012   //         incssp vreg2  # fix the SSP according to the lower 8 bits
35013   //         shr 8, vreg2
35014   //         je sinkMBB
35015   // fixShadowLoopPrepareMBB:
35016   //         shl vreg2
35017   //         mov 128, vreg3
35018   // fixShadowLoopMBB:
35019   //         incssp vreg3
35020   //         dec vreg2
35021   //         jne fixShadowLoopMBB # Iterate until you finish fixing
35022   //                              # the Shadow Stack
35023   // sinkMBB:
35024 
35025   MachineFunction::iterator I = ++MBB->getIterator();
35026   const BasicBlock *BB = MBB->getBasicBlock();
35027 
35028   MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
35029   MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35030   MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
35031   MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
35032   MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
35033   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35034   MF->insert(I, checkSspMBB);
35035   MF->insert(I, fallMBB);
35036   MF->insert(I, fixShadowMBB);
35037   MF->insert(I, fixShadowLoopPrepareMBB);
35038   MF->insert(I, fixShadowLoopMBB);
35039   MF->insert(I, sinkMBB);
35040 
35041   // Transfer the remainder of BB and its successor edges to sinkMBB.
35042   sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
35043                   MBB->end());
35044   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35045 
35046   MBB->addSuccessor(checkSspMBB);
35047 
35048   // Initialize a register with zero.
35049   Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
35050   BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
35051 
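  // MOV32r0 only defines a 32-bit register; on 64-bit targets widen the zero
  // to 64 bits with SUBREG_TO_REG so that RDSSPQ below can use it.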
35052   if (PVT == MVT::i64) {
35053     Register TmpZReg = MRI.createVirtualRegister(PtrRC);
35054     BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
35055       .addImm(0)
35056       .addReg(ZReg)
35057       .addImm(X86::sub_32bit);
35058     ZReg = TmpZReg;
35059   }
35060 
  // Read the current SSP register value into the zeroed register.
35062   Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35063   unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35064   BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35065 
35066   // Check whether the result of the SSP register is zero and jump directly
35067   // to the sink.
35068   unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
35069   BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
35070       .addReg(SSPCopyReg)
35071       .addReg(SSPCopyReg);
35072   BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
35073   checkSspMBB->addSuccessor(sinkMBB);
35074   checkSspMBB->addSuccessor(fallMBB);
35075 
35076   // Reload the previously saved SSP register value.
35077   Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
35078   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35079   const int64_t SPPOffset = 3 * PVT.getStoreSize();
35080   MachineInstrBuilder MIB =
35081       BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
35082   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35083     const MachineOperand &MO = MI.getOperand(i);
35084     if (i == X86::AddrDisp)
35085       MIB.addDisp(MO, SPPOffset);
35086     else if (MO.isReg()) // Don't add the whole operand, we don't want to
35087                          // preserve kill flags.
35088       MIB.addReg(MO.getReg());
35089     else
35090       MIB.add(MO);
35091   }
35092   MIB.setMemRefs(MMOs);
35093 
35094   // Subtract the current SSP from the previous SSP.
35095   Register SspSubReg = MRI.createVirtualRegister(PtrRC);
35096   unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
35097   BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
35098       .addReg(PrevSSPReg)
35099       .addReg(SSPCopyReg);
35100 
35101   // Jump to sink in case PrevSSPReg <= SSPCopyReg.
35102   BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
35103   fallMBB->addSuccessor(sinkMBB);
35104   fallMBB->addSuccessor(fixShadowMBB);
35105 
35106   // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
35107   unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
35108   unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
35109   Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
35110   BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
35111       .addReg(SspSubReg)
35112       .addImm(Offset);
35113 
  // Advance the SSP according to the lower 8 bits of the delta; incssp only
  // looks at the low 8 bits of its operand.
35115   unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
35116   BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
35117 
  // Shift out the lower 8 bits of the delta, which have already been handled.
35119   Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
35120   BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
35121       .addReg(SspFirstShrReg)
35122       .addImm(8);
35123 
35124   // Jump if the result of the shift is zero.
35125   BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
35126   fixShadowMBB->addSuccessor(sinkMBB);
35127   fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
35128 
35129   // Do a single shift left.
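  // SspSecondShrReg counts the remaining delta in units of 256 shadow-stack
  // entries; doubling it with the shift gives the number of 128-entry incssp
  // steps the loop below has to perform.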
35130   unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
35131   Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
35132   BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
35133       .addReg(SspSecondShrReg);
35134 
35135   // Save the value 128 to a register (will be used next with incssp).
35136   Register Value128InReg = MRI.createVirtualRegister(PtrRC);
35137   unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
35138   BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
35139       .addImm(128);
35140   fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
35141 
35142   // Since incssp only looks at the lower 8 bits, we might need to do several
35143   // iterations of incssp until we finish fixing the shadow stack.
35144   Register DecReg = MRI.createVirtualRegister(PtrRC);
35145   Register CounterReg = MRI.createVirtualRegister(PtrRC);
35146   BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
35147       .addReg(SspAfterShlReg)
35148       .addMBB(fixShadowLoopPrepareMBB)
35149       .addReg(DecReg)
35150       .addMBB(fixShadowLoopMBB);
35151 
35152   // Every iteration we increase the SSP by 128.
35153   BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
35154 
35155   // Every iteration we decrement the counter by 1.
35156   unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
35157   BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
35158 
35159   // Jump if the counter is not zero yet.
35160   BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
35161   fixShadowLoopMBB->addSuccessor(sinkMBB);
35162   fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
35163 
35164   return sinkMBB;
35165 }
35166 
35167 MachineBasicBlock *
35168 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
35169                                      MachineBasicBlock *MBB) const {
35170   const DebugLoc &DL = MI.getDebugLoc();
35171   MachineFunction *MF = MBB->getParent();
35172   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35173   MachineRegisterInfo &MRI = MF->getRegInfo();
35174 
35175   // Memory Reference
35176   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35177                                            MI.memoperands_end());
35178 
35179   MVT PVT = getPointerTy(MF->getDataLayout());
35180   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35181          "Invalid Pointer Size!");
35182 
35183   const TargetRegisterClass *RC =
35184     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
35185   Register Tmp = MRI.createVirtualRegister(RC);
35186   // Since FP is only updated here but NOT referenced, it's treated as GPR.
35187   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35188   Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
35189   Register SP = RegInfo->getStackRegister();
35190 
35191   MachineInstrBuilder MIB;
35192 
35193   const int64_t LabelOffset = 1 * PVT.getStoreSize();
35194   const int64_t SPOffset = 2 * PVT.getStoreSize();
35195 
35196   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35197   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
35198 
35199   MachineBasicBlock *thisMBB = MBB;
35200 
  // When the CET shadow stack is enabled, we need to fix it up as well.
35202   if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35203     thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
35204   }
35205 
35206   // Reload FP
35207   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
35208   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35209     const MachineOperand &MO = MI.getOperand(i);
35210     if (MO.isReg()) // Don't add the whole operand, we don't want to
35211                     // preserve kill flags.
35212       MIB.addReg(MO.getReg());
35213     else
35214       MIB.add(MO);
35215   }
35216   MIB.setMemRefs(MMOs);
35217 
35218   // Reload IP
35219   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
35220   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35221     const MachineOperand &MO = MI.getOperand(i);
35222     if (i == X86::AddrDisp)
35223       MIB.addDisp(MO, LabelOffset);
35224     else if (MO.isReg()) // Don't add the whole operand, we don't want to
35225                          // preserve kill flags.
35226       MIB.addReg(MO.getReg());
35227     else
35228       MIB.add(MO);
35229   }
35230   MIB.setMemRefs(MMOs);
35231 
35232   // Reload SP
35233   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
35234   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35235     if (i == X86::AddrDisp)
35236       MIB.addDisp(MI.getOperand(i), SPOffset);
35237     else
35238       MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
35239                                  // the last instruction of the expansion.
35240   }
35241   MIB.setMemRefs(MMOs);
35242 
35243   // Jump
35244   BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
35245 
35246   MI.eraseFromParent();
35247   return thisMBB;
35248 }
35249 
35250 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
35251                                                MachineBasicBlock *MBB,
35252                                                MachineBasicBlock *DispatchBB,
35253                                                int FI) const {
35254   const DebugLoc &DL = MI.getDebugLoc();
35255   MachineFunction *MF = MBB->getParent();
35256   MachineRegisterInfo *MRI = &MF->getRegInfo();
35257   const X86InstrInfo *TII = Subtarget.getInstrInfo();
35258 
35259   MVT PVT = getPointerTy(MF->getDataLayout());
35260   assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
35261 
35262   unsigned Op = 0;
35263   unsigned VR = 0;
35264 
35265   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35266                      !isPositionIndependent();
35267 
35268   if (UseImmLabel) {
35269     Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35270   } else {
35271     const TargetRegisterClass *TRC =
35272         (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
35273     VR = MRI->createVirtualRegister(TRC);
35274     Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35275 
35276     if (Subtarget.is64Bit())
35277       BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
35278           .addReg(X86::RIP)
35279           .addImm(1)
35280           .addReg(0)
35281           .addMBB(DispatchBB)
35282           .addReg(0);
35283     else
35284       BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
35285           .addReg(0) /* TII->getGlobalBaseReg(MF) */
35286           .addImm(1)
35287           .addReg(0)
35288           .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
35289           .addReg(0);
35290   }
35291 
35292   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
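  // The frame reference below stores DispatchBB's address at offset 56
  // (64-bit) / 36 (32-bit) into the function context, i.e. the second
  // jump-buffer slot; the SjLj runtime branches through that slot when
  // unwinding reaches this frame.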
35293   addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
35294   if (UseImmLabel)
35295     MIB.addMBB(DispatchBB);
35296   else
35297     MIB.addReg(VR);
35298 }
35299 
35300 MachineBasicBlock *
35301 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
35302                                          MachineBasicBlock *BB) const {
35303   const DebugLoc &DL = MI.getDebugLoc();
35304   MachineFunction *MF = BB->getParent();
35305   MachineRegisterInfo *MRI = &MF->getRegInfo();
35306   const X86InstrInfo *TII = Subtarget.getInstrInfo();
35307   int FI = MF->getFrameInfo().getFunctionContextIndex();
35308 
35309   // Get a mapping of the call site numbers to all of the landing pads they're
35310   // associated with.
35311   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
35312   unsigned MaxCSNum = 0;
35313   for (auto &MBB : *MF) {
35314     if (!MBB.isEHPad())
35315       continue;
35316 
35317     MCSymbol *Sym = nullptr;
35318     for (const auto &MI : MBB) {
35319       if (MI.isDebugInstr())
35320         continue;
35321 
35322       assert(MI.isEHLabel() && "expected EH_LABEL");
35323       Sym = MI.getOperand(0).getMCSymbol();
35324       break;
35325     }
35326 
35327     if (!MF->hasCallSiteLandingPad(Sym))
35328       continue;
35329 
35330     for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
35331       CallSiteNumToLPad[CSI].push_back(&MBB);
35332       MaxCSNum = std::max(MaxCSNum, CSI);
35333     }
35334   }
35335 
35336   // Get an ordered list of the machine basic blocks for the jump table.
35337   std::vector<MachineBasicBlock *> LPadList;
35338   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
35339   LPadList.reserve(CallSiteNumToLPad.size());
35340 
35341   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
35342     for (auto &LP : CallSiteNumToLPad[CSI]) {
35343       LPadList.push_back(LP);
35344       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
35345     }
35346   }
35347 
35348   assert(!LPadList.empty() &&
35349          "No landing pad destinations for the dispatch jump table!");
35350 
35351   // Create the MBBs for the dispatch code.
35352 
35353   // Shove the dispatch's address into the return slot in the function context.
35354   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
35355   DispatchBB->setIsEHPad(true);
35356 
35357   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
35358   BuildMI(TrapBB, DL, TII->get(X86::TRAP));
35359   DispatchBB->addSuccessor(TrapBB);
35360 
35361   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
35362   DispatchBB->addSuccessor(DispContBB);
35363 
35364   // Insert MBBs.
35365   MF->push_back(DispatchBB);
35366   MF->push_back(DispContBB);
35367   MF->push_back(TrapBB);
35368 
35369   // Insert code into the entry block that creates and registers the function
35370   // context.
35371   SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
35372 
35373   // Create the jump table and associated information
35374   unsigned JTE = getJumpTableEncoding();
35375   MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
35376   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
35377 
35378   const X86RegisterInfo &RI = TII->getRegisterInfo();
35379   // Add a register mask with no preserved registers.  This results in all
35380   // registers being marked as clobbered.
35381   if (RI.hasBasePointer(*MF)) {
35382     const bool FPIs64Bit =
35383         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
35384     X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
35385     MFI->setRestoreBasePointer(MF);
35386 
35387     Register FP = RI.getFrameRegister(*MF);
35388     Register BP = RI.getBaseRegister();
35389     unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
35390     addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
35391                  MFI->getRestoreBasePointerOffset())
35392         .addRegMask(RI.getNoPreservedMask());
35393   } else {
35394     BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
35395         .addRegMask(RI.getNoPreservedMask());
35396   }
35397 
35398   // IReg is used as an index in a memory operand and therefore can't be SP
35399   Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
35400   addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
35401                     Subtarget.is64Bit() ? 8 : 4);
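  // The value just loaded is the call-site index that the SjLj runtime stored
  // into the function context (offset 8 on 64-bit, 4 on 32-bit) before
  // resuming; it selects the landing pad through the jump table below.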
35402   BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
35403       .addReg(IReg)
35404       .addImm(LPadList.size());
35405   BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
35406 
35407   if (Subtarget.is64Bit()) {
35408     Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
35409     Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
35410 
35411     // leaq .LJTI0_0(%rip), BReg
35412     BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
35413         .addReg(X86::RIP)
35414         .addImm(1)
35415         .addReg(0)
35416         .addJumpTableIndex(MJTI)
35417         .addReg(0);
35418     // movzx IReg64, IReg
35419     BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
35420         .addImm(0)
35421         .addReg(IReg)
35422         .addImm(X86::sub_32bit);
35423 
35424     switch (JTE) {
35425     case MachineJumpTableInfo::EK_BlockAddress:
35426       // jmpq *(BReg,IReg64,8)
35427       BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
35428           .addReg(BReg)
35429           .addImm(8)
35430           .addReg(IReg64)
35431           .addImm(0)
35432           .addReg(0);
35433       break;
35434     case MachineJumpTableInfo::EK_LabelDifference32: {
35435       Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
35436       Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
35437       Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
35438 
35439       // movl (BReg,IReg64,4), OReg
35440       BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
35441           .addReg(BReg)
35442           .addImm(4)
35443           .addReg(IReg64)
35444           .addImm(0)
35445           .addReg(0);
35446       // movsx OReg64, OReg
35447       BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
35448       // addq BReg, OReg64, TReg
35449       BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
35450           .addReg(OReg64)
35451           .addReg(BReg);
35452       // jmpq *TReg
35453       BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
35454       break;
35455     }
35456     default:
35457       llvm_unreachable("Unexpected jump table encoding");
35458     }
35459   } else {
35460     // jmpl *.LJTI0_0(,IReg,4)
35461     BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
35462         .addReg(0)
35463         .addImm(4)
35464         .addReg(IReg)
35465         .addJumpTableIndex(MJTI)
35466         .addReg(0);
35467   }
35468 
35469   // Add the jump table entries as successors to the MBB.
35470   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
35471   for (auto &LP : LPadList)
35472     if (SeenMBBs.insert(LP).second)
35473       DispContBB->addSuccessor(LP);
35474 
35475   // N.B. the order the invoke BBs are processed in doesn't matter here.
35476   SmallVector<MachineBasicBlock *, 64> MBBLPads;
35477   const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
35478   for (MachineBasicBlock *MBB : InvokeBBs) {
35479     // Remove the landing pad successor from the invoke block and replace it
35480     // with the new dispatch block.
35481     // Keep a copy of Successors since it's modified inside the loop.
35482     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
35483                                                    MBB->succ_rend());
35484     // FIXME: Avoid quadratic complexity.
35485     for (auto MBBS : Successors) {
35486       if (MBBS->isEHPad()) {
35487         MBB->removeSuccessor(MBBS);
35488         MBBLPads.push_back(MBBS);
35489       }
35490     }
35491 
35492     MBB->addSuccessor(DispatchBB);
35493 
    // Find the invoke call and mark all of the callee-saved registers as
    // 'implicitly defined' so that they're spilled.  This prevents later
    // passes from moving instructions to before the EH block, where they
    // would never be executed.
35498     for (auto &II : reverse(*MBB)) {
35499       if (!II.isCall())
35500         continue;
35501 
35502       DenseMap<unsigned, bool> DefRegs;
35503       for (auto &MOp : II.operands())
35504         if (MOp.isReg())
35505           DefRegs[MOp.getReg()] = true;
35506 
35507       MachineInstrBuilder MIB(*MF, &II);
35508       for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
35509         unsigned Reg = SavedRegs[RegIdx];
35510         if (!DefRegs[Reg])
35511           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
35512       }
35513 
35514       break;
35515     }
35516   }
35517 
35518   // Mark all former landing pads as non-landing pads.  The dispatch is the only
35519   // landing pad now.
35520   for (auto &LP : MBBLPads)
35521     LP->setIsEHPad(false);
35522 
35523   // The instruction is gone now.
35524   MI.eraseFromParent();
35525   return BB;
35526 }
35527 
35528 MachineBasicBlock *
35529 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
35530                                                MachineBasicBlock *BB) const {
35531   MachineFunction *MF = BB->getParent();
35532   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35533   const DebugLoc &DL = MI.getDebugLoc();
35534 
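  // Map an AMX tile index immediate (0-7) to the corresponding TMM physical
  // register.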
35535   auto TMMImmToTMMReg = [](unsigned Imm) {
    assert(Imm < 8 && "Illegal tmm index");
35537     return X86::TMM0 + Imm;
35538   };
35539   switch (MI.getOpcode()) {
35540   default: llvm_unreachable("Unexpected instr type to insert");
35541   case X86::TLS_addr32:
35542   case X86::TLS_addr64:
35543   case X86::TLS_addrX32:
35544   case X86::TLS_base_addr32:
35545   case X86::TLS_base_addr64:
35546   case X86::TLS_base_addrX32:
35547     return EmitLoweredTLSAddr(MI, BB);
35548   case X86::INDIRECT_THUNK_CALL32:
35549   case X86::INDIRECT_THUNK_CALL64:
35550   case X86::INDIRECT_THUNK_TCRETURN32:
35551   case X86::INDIRECT_THUNK_TCRETURN64:
35552     return EmitLoweredIndirectThunk(MI, BB);
35553   case X86::CATCHRET:
35554     return EmitLoweredCatchRet(MI, BB);
35555   case X86::SEG_ALLOCA_32:
35556   case X86::SEG_ALLOCA_64:
35557     return EmitLoweredSegAlloca(MI, BB);
35558   case X86::PROBED_ALLOCA_32:
35559   case X86::PROBED_ALLOCA_64:
35560     return EmitLoweredProbedAlloca(MI, BB);
35561   case X86::TLSCall_32:
35562   case X86::TLSCall_64:
35563     return EmitLoweredTLSCall(MI, BB);
35564   case X86::CMOV_FR32:
35565   case X86::CMOV_FR32X:
35566   case X86::CMOV_FR64:
35567   case X86::CMOV_FR64X:
35568   case X86::CMOV_GR8:
35569   case X86::CMOV_GR16:
35570   case X86::CMOV_GR32:
35571   case X86::CMOV_RFP32:
35572   case X86::CMOV_RFP64:
35573   case X86::CMOV_RFP80:
35574   case X86::CMOV_VR64:
35575   case X86::CMOV_VR128:
35576   case X86::CMOV_VR128X:
35577   case X86::CMOV_VR256:
35578   case X86::CMOV_VR256X:
35579   case X86::CMOV_VR512:
35580   case X86::CMOV_VK1:
35581   case X86::CMOV_VK2:
35582   case X86::CMOV_VK4:
35583   case X86::CMOV_VK8:
35584   case X86::CMOV_VK16:
35585   case X86::CMOV_VK32:
35586   case X86::CMOV_VK64:
35587     return EmitLoweredSelect(MI, BB);
35588 
35589   case X86::RDFLAGS32:
35590   case X86::RDFLAGS64: {
35591     unsigned PushF =
35592         MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
35593     unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
35594     MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
35595     // Permit reads of the EFLAGS and DF registers without them being defined.
35596     // This intrinsic exists to read external processor state in flags, such as
35597     // the trap flag, interrupt flag, and direction flag, none of which are
35598     // modeled by the backend.
35599     assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
35600            "Unexpected register in operand!");
35601     Push->getOperand(2).setIsUndef();
35602     assert(Push->getOperand(3).getReg() == X86::DF &&
35603            "Unexpected register in operand!");
35604     Push->getOperand(3).setIsUndef();
35605     BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
35606 
35607     MI.eraseFromParent(); // The pseudo is gone now.
35608     return BB;
35609   }
35610 
35611   case X86::WRFLAGS32:
35612   case X86::WRFLAGS64: {
35613     unsigned Push =
35614         MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
35615     unsigned PopF =
35616         MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
35617     BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
35618     BuildMI(*BB, MI, DL, TII->get(PopF));
35619 
35620     MI.eraseFromParent(); // The pseudo is gone now.
35621     return BB;
35622   }
35623 
35624   case X86::FP32_TO_INT16_IN_MEM:
35625   case X86::FP32_TO_INT32_IN_MEM:
35626   case X86::FP32_TO_INT64_IN_MEM:
35627   case X86::FP64_TO_INT16_IN_MEM:
35628   case X86::FP64_TO_INT32_IN_MEM:
35629   case X86::FP64_TO_INT64_IN_MEM:
35630   case X86::FP80_TO_INT16_IN_MEM:
35631   case X86::FP80_TO_INT32_IN_MEM:
35632   case X86::FP80_TO_INT64_IN_MEM: {
35633     // Change the floating point control register to use "round towards zero"
35634     // mode when truncating to an integer value.
35635     int OrigCWFrameIdx =
35636         MF->getFrameInfo().CreateStackObject(2, Align(2), false);
35637     addFrameReference(BuildMI(*BB, MI, DL,
35638                               TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
35639 
35640     // Load the old value of the control word...
35641     Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
35642     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
35643                       OrigCWFrameIdx);
35644 
35645     // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
35646     Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
35647     BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
35648       .addReg(OldCW, RegState::Kill).addImm(0xC00);
35649 
35650     // Extract to 16 bits.
35651     Register NewCW16 =
35652         MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
35653     BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
35654       .addReg(NewCW, RegState::Kill, X86::sub_16bit);
35655 
35656     // Prepare memory for FLDCW.
35657     int NewCWFrameIdx =
35658         MF->getFrameInfo().CreateStackObject(2, Align(2), false);
35659     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
35660                       NewCWFrameIdx)
35661       .addReg(NewCW16, RegState::Kill);
35662 
35663     // Reload the modified control word now...
35664     addFrameReference(BuildMI(*BB, MI, DL,
35665                               TII->get(X86::FLDCW16m)), NewCWFrameIdx);
35666 
35667     // Get the X86 opcode to use.
35668     unsigned Opc;
35669     switch (MI.getOpcode()) {
35670     default: llvm_unreachable("illegal opcode!");
35671     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
35672     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
35673     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
35674     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
35675     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
35676     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
35677     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
35678     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
35679     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
35680     }
35681 
35682     X86AddressMode AM = getAddressFromInstr(&MI, 0);
35683     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
35684         .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
35685 
35686     // Reload the original control word now.
35687     addFrameReference(BuildMI(*BB, MI, DL,
35688                               TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
35689 
35690     MI.eraseFromParent(); // The pseudo instruction is gone now.
35691     return BB;
35692   }
35693 
35694   // xbegin
35695   case X86::XBEGIN:
35696     return emitXBegin(MI, BB, Subtarget.getInstrInfo());
35697 
35698   case X86::VAARG_64:
35699   case X86::VAARG_X32:
35700     return EmitVAARGWithCustomInserter(MI, BB);
35701 
35702   case X86::EH_SjLj_SetJmp32:
35703   case X86::EH_SjLj_SetJmp64:
35704     return emitEHSjLjSetJmp(MI, BB);
35705 
35706   case X86::EH_SjLj_LongJmp32:
35707   case X86::EH_SjLj_LongJmp64:
35708     return emitEHSjLjLongJmp(MI, BB);
35709 
35710   case X86::Int_eh_sjlj_setup_dispatch:
35711     return EmitSjLjDispatchBlock(MI, BB);
35712 
35713   case TargetOpcode::STATEPOINT:
35714     // As an implementation detail, STATEPOINT shares the STACKMAP format at
35715     // this point in the process.  We diverge later.
35716     return emitPatchPoint(MI, BB);
35717 
35718   case TargetOpcode::STACKMAP:
35719   case TargetOpcode::PATCHPOINT:
35720     return emitPatchPoint(MI, BB);
35721 
35722   case TargetOpcode::PATCHABLE_EVENT_CALL:
35723   case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
35724     return BB;
35725 
35726   case X86::LCMPXCHG8B: {
35727     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
    // In addition to the four E[ABCD] registers implied by its encoding,
    // CMPXCHG8B requires a memory operand. If the target is i686 and the
    // current function needs a base pointer - which is ESI on i686 - the
    // register allocator would not be able to allocate registers for an
    // address of the form X(%reg, %reg, Y): there would never be enough
    // unreserved registers during regalloc (without the base pointer, the
    // only option would be X(%edi, %esi, Y)). We give the register allocator
    // a hand by precomputing the address in a new vreg using LEA.
35737 
    // If this is not i686 or there is no base pointer, there is nothing to do.
35739     if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
35740       return BB;
35741 
    // Even though this code does not strictly need the base pointer to be
    // ESI, we check for it anyway: if this assert fails, something has
    // changed in the compiler's base pointer handling and most likely needs
    // to be addressed here as well.
35746     assert(TRI->getBaseRegister() == X86::ESI &&
35747            "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
35748            "base pointer in mind");
35749 
35750     MachineRegisterInfo &MRI = MF->getRegInfo();
35751     MVT SPTy = getPointerTy(MF->getDataLayout());
35752     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
35753     Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
35754 
35755     X86AddressMode AM = getAddressFromInstr(&MI, 0);
    // Regalloc does not need any help when the memory operand of CMPXCHG8B
    // does not use an index register.
35758     if (AM.IndexReg == X86::NoRegister)
35759       return BB;
35760 
35761     // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
35762     // four operand definitions that are E[ABCD] registers. We skip them and
35763     // then insert the LEA.
35764     MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
35765     while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
35766                                    RMBBI->definesRegister(X86::EBX) ||
35767                                    RMBBI->definesRegister(X86::ECX) ||
35768                                    RMBBI->definesRegister(X86::EDX))) {
35769       ++RMBBI;
35770     }
35771     MachineBasicBlock::iterator MBBI(RMBBI);
35772     addFullAddress(
35773         BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
35774 
35775     setDirectAddressInInstr(&MI, 0, computedAddrVReg);
35776 
35777     return BB;
35778   }
35779   case X86::LCMPXCHG16B_NO_RBX: {
35780     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
35781     Register BasePtr = TRI->getBaseRegister();
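    // CMPXCHG16B implicitly uses RBX. If RBX also serves as the base pointer
    // it cannot simply be clobbered, so save the current RBX in a virtual
    // register and emit the LCMPXCHG16B_SAVE_RBX pseudo, which takes both the
    // desired RBX value and the saved copy.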
35782     if (TRI->hasBasePointer(*MF) &&
35783         (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
35784       if (!BB->isLiveIn(BasePtr))
35785         BB->addLiveIn(BasePtr);
35786       // Save RBX into a virtual register.
35787       Register SaveRBX =
35788           MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
35789       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
35790           .addReg(X86::RBX);
35791       Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
35792       MachineInstrBuilder MIB =
35793           BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
35794       for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
35795         MIB.add(MI.getOperand(Idx));
35796       MIB.add(MI.getOperand(X86::AddrNumOperands));
35797       MIB.addReg(SaveRBX);
35798     } else {
35799       // Simple case, just copy the virtual register to RBX.
35800       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
35801           .add(MI.getOperand(X86::AddrNumOperands));
35802       MachineInstrBuilder MIB =
35803           BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
35804       for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
35805         MIB.add(MI.getOperand(Idx));
35806     }
35807     MI.eraseFromParent();
35808     return BB;
35809   }
35810   case X86::MWAITX: {
35811     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
35812     Register BasePtr = TRI->getBaseRegister();
35813     bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
    // If there is no need to save the base pointer, generate MWAITXrrr;
    // otherwise generate the MWAITX_SAVE_RBX pseudo.
35816     if (!IsRBX || !TRI->hasBasePointer(*MF)) {
35817       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
35818           .addReg(MI.getOperand(0).getReg());
35819       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
35820           .addReg(MI.getOperand(1).getReg());
35821       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
35822           .addReg(MI.getOperand(2).getReg());
35823       BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
35824       MI.eraseFromParent();
35825     } else {
35826       if (!BB->isLiveIn(BasePtr)) {
35827         BB->addLiveIn(BasePtr);
35828       }
35829       // Parameters can be copied into ECX and EAX but not EBX yet.
35830       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
35831           .addReg(MI.getOperand(0).getReg());
35832       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
35833           .addReg(MI.getOperand(1).getReg());
35834       assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
35835       // Save RBX into a virtual register.
35836       Register SaveRBX =
35837           MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
35838       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
35839           .addReg(X86::RBX);
35840       // Generate mwaitx pseudo.
35841       Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
35842       BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
35843           .addDef(Dst) // Destination tied in with SaveRBX.
35844           .addReg(MI.getOperand(2).getReg()) // input value of EBX.
35845           .addUse(SaveRBX);                  // Save of base pointer.
35846       MI.eraseFromParent();
35847     }
35848     return BB;
35849   }
35850   case TargetOpcode::PREALLOCATED_SETUP: {
35851     assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
35852     auto MFI = MF->getInfo<X86MachineFunctionInfo>();
35853     MFI->setHasPreallocatedCall(true);
35854     int64_t PreallocatedId = MI.getOperand(0).getImm();
35855     size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
35856     assert(StackAdjustment != 0 && "0 stack adjustment");
35857     LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
35858                       << StackAdjustment << "\n");
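    // Carve out the preallocated argument area by subtracting its size from
    // ESP.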
35859     BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
35860         .addReg(X86::ESP)
35861         .addImm(StackAdjustment);
35862     MI.eraseFromParent();
35863     return BB;
35864   }
35865   case TargetOpcode::PREALLOCATED_ARG: {
35866     assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
35867     int64_t PreallocatedId = MI.getOperand(1).getImm();
35868     int64_t ArgIdx = MI.getOperand(2).getImm();
35869     auto MFI = MF->getInfo<X86MachineFunctionInfo>();
35870     size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
35871     LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
35872                       << ", arg offset " << ArgOffset << "\n");
35873     // stack pointer + offset
35874     addRegOffset(
35875         BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
35876         X86::ESP, false, ArgOffset);
35877     MI.eraseFromParent();
35878     return BB;
35879   }
35880   case X86::PTDPBSSD:
35881   case X86::PTDPBSUD:
35882   case X86::PTDPBUSD:
35883   case X86::PTDPBUUD:
35884   case X86::PTDPBF16PS: {
35885     unsigned Opc;
35886     switch (MI.getOpcode()) {
35887     default: llvm_unreachable("illegal opcode!");
35888     case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
35889     case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
35890     case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
35891     case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
35892     case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
35893     }
35894 
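    // Operand 0 is the accumulator tile: TDPB* both reads and writes it, so
    // add it as a definition followed by three source reads (the accumulator
    // itself and the two input tiles), all marked RegState::Undef.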
35895     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
35896     MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
35897     MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
35898     MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
35899     MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
35900 
35901     MI.eraseFromParent(); // The pseudo is gone now.
35902     return BB;
35903   }
35904   case X86::PTILEZERO: {
35905     unsigned Imm = MI.getOperand(0).getImm();
35906     BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
35907     MI.eraseFromParent(); // The pseudo is gone now.
35908     return BB;
35909   }
35910   case X86::PTILELOADD:
35911   case X86::PTILELOADDT1:
35912   case X86::PTILESTORED: {
35913     unsigned Opc;
35914     switch (MI.getOpcode()) {
35915     default: llvm_unreachable("illegal opcode!");
35916     case X86::PTILELOADD:   Opc = X86::TILELOADD;   break;
35917     case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
35918     case X86::PTILESTORED:  Opc = X86::TILESTORED;  break;
35919     }
35920 
35921     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
35922     unsigned CurOp = 0;
35923     if (Opc != X86::TILESTORED)
35924       MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
35925                  RegState::Define);
35926 
35927     MIB.add(MI.getOperand(CurOp++)); // base
35928     MIB.add(MI.getOperand(CurOp++)); // scale
35929     MIB.add(MI.getOperand(CurOp++)); // index -- stride
35930     MIB.add(MI.getOperand(CurOp++)); // displacement
35931     MIB.add(MI.getOperand(CurOp++)); // segment
35932 
35933     if (Opc == X86::TILESTORED)
35934       MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
35935                  RegState::Undef);
35936 
35937     MI.eraseFromParent(); // The pseudo is gone now.
35938     return BB;
35939   }
35940   }
35941 }
35942 
35943 //===----------------------------------------------------------------------===//
35944 //                           X86 Optimization Hooks
35945 //===----------------------------------------------------------------------===//
35946 
35947 bool
35948 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
35949                                                 const APInt &DemandedBits,
35950                                                 const APInt &DemandedElts,
35951                                                 TargetLoweringOpt &TLO) const {
35952   EVT VT = Op.getValueType();
35953   unsigned Opcode = Op.getOpcode();
35954   unsigned EltSize = VT.getScalarSizeInBits();
35955 
35956   if (VT.isVector()) {
    // If the constant is all sign bits within the active bits, sign extend it
    // across the entire element so that it can act as a boolean constant
    // vector.
35960     auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
35961       if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
35962         return false;
35963       for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
35964         if (!DemandedElts[i] || V.getOperand(i).isUndef())
35965           continue;
35966         const APInt &Val = V.getConstantOperandAPInt(i);
35967         if (Val.getBitWidth() > Val.getNumSignBits() &&
35968             Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
35969           return true;
35970       }
35971       return false;
35972     };
35973     // For vectors - if we have a constant, then try to sign extend.
35974     // TODO: Handle AND/ANDN cases.
35975     unsigned ActiveBits = DemandedBits.getActiveBits();
35976     if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
35977         (Opcode == ISD::OR || Opcode == ISD::XOR) &&
35978         NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
35979       EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
35980       EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
35981                                     VT.getVectorNumElements());
35982       SDValue NewC =
35983           TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
35984                           Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
35985       SDValue NewOp =
35986           TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
35987       return TLO.CombineTo(Op, NewOp);
35988     }
35989     return false;
35990   }
35991 
35992   // Only optimize Ands to prevent shrinking a constant that could be
35993   // matched by movzx.
35994   if (Opcode != ISD::AND)
35995     return false;
35996 
35997   // Make sure the RHS really is a constant.
35998   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
35999   if (!C)
36000     return false;
36001 
36002   const APInt &Mask = C->getAPIntValue();
36003 
36004   // Clear all non-demanded bits initially.
36005   APInt ShrunkMask = Mask & DemandedBits;
36006 
36007   // Find the width of the shrunk mask.
36008   unsigned Width = ShrunkMask.getActiveBits();
36009 
36010   // If the mask is all 0s there's nothing to do here.
36011   if (Width == 0)
36012     return false;
36013 
36014   // Find the next power of 2 width, rounding up to a byte.
36015   Width = PowerOf2Ceil(std::max(Width, 8U));
  // Clamp the width to the element size to handle illegal types.
36017   Width = std::min(Width, EltSize);
36018 
36019   // Calculate a possible zero extend mask for this constant.
36020   APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
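  // For example, with a 32-bit element, Mask = 0x1FFF and DemandedBits = 0xFF
  // shrink to Width = 8, so ZeroExtendMask becomes 0xFF - a constant that
  // movzx can match.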
36021 
36022   // If we aren't changing the mask, just return true to keep it and prevent
36023   // the caller from optimizing.
36024   if (ZeroExtendMask == Mask)
36025     return true;
36026 
36027   // Make sure the new mask can be represented by a combination of mask bits
36028   // and non-demanded bits.
36029   if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
36030     return false;
36031 
36032   // Replace the constant with the zero extend mask.
36033   SDLoc DL(Op);
36034   SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
36035   SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
36036   return TLO.CombineTo(Op, NewOp);
36037 }
36038 
36039 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
36040                                                       KnownBits &Known,
36041                                                       const APInt &DemandedElts,
36042                                                       const SelectionDAG &DAG,
36043                                                       unsigned Depth) const {
36044   unsigned BitWidth = Known.getBitWidth();
36045   unsigned NumElts = DemandedElts.getBitWidth();
36046   unsigned Opc = Op.getOpcode();
36047   EVT VT = Op.getValueType();
36048   assert((Opc >= ISD::BUILTIN_OP_END ||
36049           Opc == ISD::INTRINSIC_WO_CHAIN ||
36050           Opc == ISD::INTRINSIC_W_CHAIN ||
36051           Opc == ISD::INTRINSIC_VOID) &&
36052          "Should use MaskedValueIsZero if you don't know whether Op"
36053          " is a target node!");
36054 
36055   Known.resetAll();
36056   switch (Opc) {
36057   default: break;
36058   case X86ISD::SETCC:
36059     Known.Zero.setBitsFrom(1);
36060     break;
36061   case X86ISD::MOVMSK: {
36062     unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
36063     Known.Zero.setBitsFrom(NumLoBits);
36064     break;
36065   }
36066   case X86ISD::PEXTRB:
36067   case X86ISD::PEXTRW: {
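    // Compute the known bits of the extracted element and widen/truncate them
    // to the result width; bits above the source element size are known zero.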
36068     SDValue Src = Op.getOperand(0);
36069     EVT SrcVT = Src.getValueType();
36070     APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
36071                                             Op.getConstantOperandVal(1));
36072     Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
36073     Known = Known.anyextOrTrunc(BitWidth);
36074     Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
36075     break;
36076   }
36077   case X86ISD::VSRAI:
36078   case X86ISD::VSHLI:
36079   case X86ISD::VSRLI: {
36080     unsigned ShAmt = Op.getConstantOperandVal(1);
36081     if (ShAmt >= VT.getScalarSizeInBits()) {
36082       Known.setAllZero();
36083       break;
36084     }
36085 
36086     Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36087     if (Opc == X86ISD::VSHLI) {
36088       Known.Zero <<= ShAmt;
36089       Known.One <<= ShAmt;
36090       // Low bits are known zero.
36091       Known.Zero.setLowBits(ShAmt);
36092     } else if (Opc == X86ISD::VSRLI) {
36093       Known.Zero.lshrInPlace(ShAmt);
36094       Known.One.lshrInPlace(ShAmt);
36095       // High bits are known zero.
36096       Known.Zero.setHighBits(ShAmt);
36097     } else {
36098       Known.Zero.ashrInPlace(ShAmt);
36099       Known.One.ashrInPlace(ShAmt);
36100     }
36101     break;
36102   }
36103   case X86ISD::PACKUS: {
36104     // PACKUS is just a truncation if the upper half is zero.
36105     APInt DemandedLHS, DemandedRHS;
36106     getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
36107 
36108     Known.One = APInt::getAllOnes(BitWidth * 2);
36109     Known.Zero = APInt::getAllOnes(BitWidth * 2);
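    // Start from the conflicting all-ones state so that the first
    // KnownBits::commonBits intersection below simply adopts the known bits
    // of the first demanded input.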
36110 
36111     KnownBits Known2;
36112     if (!!DemandedLHS) {
36113       Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
36114       Known = KnownBits::commonBits(Known, Known2);
36115     }
36116     if (!!DemandedRHS) {
36117       Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
36118       Known = KnownBits::commonBits(Known, Known2);
36119     }
36120 
36121     if (Known.countMinLeadingZeros() < BitWidth)
36122       Known.resetAll();
36123     Known = Known.trunc(BitWidth);
36124     break;
36125   }
36126   case X86ISD::VBROADCAST: {
36127     SDValue Src = Op.getOperand(0);
36128     if (!Src.getSimpleValueType().isVector()) {
36129       Known = DAG.computeKnownBits(Src, Depth + 1);
36130       return;
36131     }
36132     break;
36133   }
36134   case X86ISD::ANDNP: {
36135     KnownBits Known2;
36136     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36137     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36138 
36139     // ANDNP = (~X & Y);
36140     Known.One &= Known2.Zero;
36141     Known.Zero |= Known2.One;
36142     break;
36143   }
36144   case X86ISD::FOR: {
36145     KnownBits Known2;
36146     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36147     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36148 
36149     Known |= Known2;
36150     break;
36151   }
36152   case X86ISD::PSADBW: {
36153     assert(VT.getScalarType() == MVT::i64 &&
36154            Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
36155            "Unexpected PSADBW types");
36156 
36157     // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
36158     Known.Zero.setBitsFrom(16);
36159     break;
36160   }
36161   case X86ISD::PMULUDQ: {
36162     KnownBits Known2;
36163     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36164     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36165 
36166     Known = Known.trunc(BitWidth / 2).zext(BitWidth);
36167     Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
36168     Known = KnownBits::mul(Known, Known2);
36169     break;
36170   }
36171   case X86ISD::CMOV: {
36172     Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
36173     // If we don't know any bits, early out.
36174     if (Known.isUnknown())
36175       break;
36176     KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
36177 
36178     // Only known if known in both the LHS and RHS.
36179     Known = KnownBits::commonBits(Known, Known2);
36180     break;
36181   }
36182   case X86ISD::BEXTR:
36183   case X86ISD::BEXTRI: {
36184     SDValue Op0 = Op.getOperand(0);
36185     SDValue Op1 = Op.getOperand(1);
36186 
36187     if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
36188       unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
36189       unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
36190 
36191       // If the length is 0, the result is 0.
36192       if (Length == 0) {
36193         Known.setAllZero();
36194         break;
36195       }
36196 
36197       if ((Shift + Length) <= BitWidth) {
36198         Known = DAG.computeKnownBits(Op0, Depth + 1);
36199         Known = Known.extractBits(Length, Shift);
36200         Known = Known.zextOrTrunc(BitWidth);
36201       }
36202     }
36203     break;
36204   }
36205   case X86ISD::PDEP: {
36206     KnownBits Known2;
36207     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36208     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36209     // Zeros are retained from the mask operand. But not ones.
36210     Known.One.clearAllBits();
36211     // The result will have at least as many trailing zeros as the non-mask
36212     // operand since bits can only map to the same or higher bit position.
36213     Known.Zero.setLowBits(Known2.countMinTrailingZeros());
36214     break;
36215   }
36216   case X86ISD::PEXT: {
36217     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
    // The result has at least as many leading zeros as there are known zero
    // bits in the mask.
36219     unsigned Count = Known.Zero.countPopulation();
36220     Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
36221     Known.One.clearAllBits();
36222     break;
36223   }
36224   case X86ISD::VTRUNC:
36225   case X86ISD::VTRUNCS:
36226   case X86ISD::VTRUNCUS:
36227   case X86ISD::CVTSI2P:
36228   case X86ISD::CVTUI2P:
36229   case X86ISD::CVTP2SI:
36230   case X86ISD::CVTP2UI:
36231   case X86ISD::MCVTP2SI:
36232   case X86ISD::MCVTP2UI:
36233   case X86ISD::CVTTP2SI:
36234   case X86ISD::CVTTP2UI:
36235   case X86ISD::MCVTTP2SI:
36236   case X86ISD::MCVTTP2UI:
36237   case X86ISD::MCVTSI2P:
36238   case X86ISD::MCVTUI2P:
36239   case X86ISD::VFPROUND:
36240   case X86ISD::VMFPROUND:
36241   case X86ISD::CVTPS2PH:
36242   case X86ISD::MCVTPS2PH: {
36243     // Truncations/Conversions - upper elements are known zero.
36244     EVT SrcVT = Op.getOperand(0).getValueType();
36245     if (SrcVT.isVector()) {
36246       unsigned NumSrcElts = SrcVT.getVectorNumElements();
36247       if (NumElts > NumSrcElts &&
36248           DemandedElts.countTrailingZeros() >= NumSrcElts)
36249         Known.setAllZero();
36250     }
36251     break;
36252   }
36253   case X86ISD::STRICT_CVTTP2SI:
36254   case X86ISD::STRICT_CVTTP2UI:
36255   case X86ISD::STRICT_CVTSI2P:
36256   case X86ISD::STRICT_CVTUI2P:
36257   case X86ISD::STRICT_VFPROUND:
36258   case X86ISD::STRICT_CVTPS2PH: {
36259     // Strict Conversions - upper elements are known zero.
36260     EVT SrcVT = Op.getOperand(1).getValueType();
36261     if (SrcVT.isVector()) {
36262       unsigned NumSrcElts = SrcVT.getVectorNumElements();
36263       if (NumElts > NumSrcElts &&
36264           DemandedElts.countTrailingZeros() >= NumSrcElts)
36265         Known.setAllZero();
36266     }
36267     break;
36268   }
36269   case X86ISD::MOVQ2DQ: {
36270     // Move from MMX to XMM. Upper half of XMM should be 0.
36271     if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
36272       Known.setAllZero();
36273     break;
36274   }
36275   }
36276 
36277   // Handle target shuffles.
36278   // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
36279   if (isTargetShuffle(Opc)) {
36280     SmallVector<int, 64> Mask;
36281     SmallVector<SDValue, 2> Ops;
36282     if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
36283       unsigned NumOps = Ops.size();
36284       unsigned NumElts = VT.getVectorNumElements();
36285       if (Mask.size() == NumElts) {
36286         SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
36287         Known.Zero.setAllBits(); Known.One.setAllBits();
36288         for (unsigned i = 0; i != NumElts; ++i) {
36289           if (!DemandedElts[i])
36290             continue;
36291           int M = Mask[i];
36292           if (M == SM_SentinelUndef) {
36293             // For UNDEF elements, we don't know anything about the common state
36294             // of the shuffle result.
36295             Known.resetAll();
36296             break;
36297           }
36298           if (M == SM_SentinelZero) {
36299             Known.One.clearAllBits();
36300             continue;
36301           }
36302           assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
36303                  "Shuffle index out of range");
36304 
36305           unsigned OpIdx = (unsigned)M / NumElts;
36306           unsigned EltIdx = (unsigned)M % NumElts;
36307           if (Ops[OpIdx].getValueType() != VT) {
36308             // TODO - handle target shuffle ops with different value types.
36309             Known.resetAll();
36310             break;
36311           }
36312           DemandedOps[OpIdx].setBit(EltIdx);
36313         }
36314         // Known bits are the values that are shared by every demanded element.
36315         for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
36316           if (!DemandedOps[i])
36317             continue;
36318           KnownBits Known2 =
36319               DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
36320           Known = KnownBits::commonBits(Known, Known2);
36321         }
36322       }
36323     }
36324   }
36325 }
36326 
36327 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
36328     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
36329     unsigned Depth) const {
36330   EVT VT = Op.getValueType();
36331   unsigned VTBits = VT.getScalarSizeInBits();
36332   unsigned Opcode = Op.getOpcode();
36333   switch (Opcode) {
36334   case X86ISD::SETCC_CARRY:
36335     // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
36336     return VTBits;
36337 
36338   case X86ISD::VTRUNC: {
36339     SDValue Src = Op.getOperand(0);
36340     MVT SrcVT = Src.getSimpleValueType();
36341     unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
36342     assert(VTBits < NumSrcBits && "Illegal truncation input type");
36343     APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
36344     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
36345     if (Tmp > (NumSrcBits - VTBits))
36346       return Tmp - (NumSrcBits - VTBits);
36347     return 1;
36348   }
36349 
36350   case X86ISD::PACKSS: {
36351     // PACKSS is just a truncation if the sign bits extend to the packed size.
36352     APInt DemandedLHS, DemandedRHS;
36353     getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
36354                         DemandedRHS);
36355 
36356     unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
36357     unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
36358     if (!!DemandedLHS)
36359       Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
36360     if (!!DemandedRHS)
36361       Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
36362     unsigned Tmp = std::min(Tmp0, Tmp1);
36363     if (Tmp > (SrcBits - VTBits))
36364       return Tmp - (SrcBits - VTBits);
36365     return 1;
36366   }
36367 
36368   case X86ISD::VBROADCAST: {
36369     SDValue Src = Op.getOperand(0);
36370     if (!Src.getSimpleValueType().isVector())
36371       return DAG.ComputeNumSignBits(Src, Depth + 1);
36372     break;
36373   }
36374 
36375   case X86ISD::VSHLI: {
36376     SDValue Src = Op.getOperand(0);
36377     const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
36378     if (ShiftVal.uge(VTBits))
36379       return VTBits; // Shifted all bits out --> zero.
36380     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
36381     if (ShiftVal.uge(Tmp))
36382       return 1; // Shifted all sign bits out --> unknown.
36383     return Tmp - ShiftVal.getZExtValue();
36384   }
36385 
36386   case X86ISD::VSRAI: {
36387     SDValue Src = Op.getOperand(0);
36388     APInt ShiftVal = Op.getConstantOperandAPInt(1);
36389     if (ShiftVal.uge(VTBits - 1))
36390       return VTBits; // Sign splat.
36391     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
36392     ShiftVal += Tmp;
36393     return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
36394   }
36395 
36396   case X86ISD::FSETCC:
36397     // cmpss/cmpsd return zero/all-bits result values in the bottom element.
36398     if (VT == MVT::f32 || VT == MVT::f64 ||
36399         ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
36400       return VTBits;
36401     break;
36402 
36403   case X86ISD::PCMPGT:
36404   case X86ISD::PCMPEQ:
36405   case X86ISD::CMPP:
36406   case X86ISD::VPCOM:
36407   case X86ISD::VPCOMU:
36408     // Vector compares return zero/all-bits result values.
36409     return VTBits;
36410 
36411   case X86ISD::ANDNP: {
36412     unsigned Tmp0 =
36413         DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
36414     if (Tmp0 == 1) return 1; // Early out.
36415     unsigned Tmp1 =
36416         DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
36417     return std::min(Tmp0, Tmp1);
36418   }
36419 
36420   case X86ISD::CMOV: {
36421     unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
36422     if (Tmp0 == 1) return 1;  // Early out.
36423     unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
36424     return std::min(Tmp0, Tmp1);
36425   }
36426   }
36427 
36428   // Handle target shuffles.
36429   // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
36430   if (isTargetShuffle(Opcode)) {
36431     SmallVector<int, 64> Mask;
36432     SmallVector<SDValue, 2> Ops;
36433     if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
36434       unsigned NumOps = Ops.size();
36435       unsigned NumElts = VT.getVectorNumElements();
36436       if (Mask.size() == NumElts) {
36437         SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
36438         for (unsigned i = 0; i != NumElts; ++i) {
36439           if (!DemandedElts[i])
36440             continue;
36441           int M = Mask[i];
36442           if (M == SM_SentinelUndef) {
36443             // For UNDEF elements, we don't know anything about the common state
36444             // of the shuffle result.
36445             return 1;
36446           } else if (M == SM_SentinelZero) {
36447             // Zero = all sign bits.
36448             continue;
36449           }
36450           assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
36451                  "Shuffle index out of range");
36452 
36453           unsigned OpIdx = (unsigned)M / NumElts;
36454           unsigned EltIdx = (unsigned)M % NumElts;
36455           if (Ops[OpIdx].getValueType() != VT) {
36456             // TODO - handle target shuffle ops with different value types.
36457             return 1;
36458           }
36459           DemandedOps[OpIdx].setBit(EltIdx);
36460         }
36461         unsigned Tmp0 = VTBits;
36462         for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
36463           if (!DemandedOps[i])
36464             continue;
36465           unsigned Tmp1 =
36466               DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
36467           Tmp0 = std::min(Tmp0, Tmp1);
36468         }
36469         return Tmp0;
36470       }
36471     }
36472   }
36473 
36474   // Fallback case.
36475   return 1;
36476 }
36477 
36478 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
36479   if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
36480     return N->getOperand(0);
36481   return N;
36482 }
36483 
36484 // Helper to look for a normal load that can be narrowed into a vzload with the
36485 // specified VT and memory VT. Returns SDValue() on failure.
36486 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
36487                                   SelectionDAG &DAG) {
36488   // Can't if the load is volatile or atomic.
36489   if (!LN->isSimple())
36490     return SDValue();
36491 
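  // Build a VZEXT_LOAD that reuses the original load's chain, pointer and
  // memory operand flags but only loads MemVT from memory, zero-filling the
  // remaining vector bytes.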
36492   SDVTList Tys = DAG.getVTList(VT, MVT::Other);
36493   SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
36494   return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
36495                                  LN->getPointerInfo(), LN->getOriginalAlign(),
36496                                  LN->getMemOperand()->getFlags());
36497 }
36498 
36499 // Attempt to match a combined shuffle mask against supported unary shuffle
36500 // instructions.
36501 // TODO: Investigate sharing more of this with shuffle lowering.
36502 static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
36503                               bool AllowFloatDomain, bool AllowIntDomain,
36504                               SDValue V1, const X86Subtarget &Subtarget,
36505                               unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
36506   unsigned NumMaskElts = Mask.size();
36507   unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
36508 
36509   // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
36510   if (Mask[0] == 0 &&
36511       (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
36512     if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
36513         (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36514          isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
36515       Shuffle = X86ISD::VZEXT_MOVL;
36516       if (MaskEltSize == 16)
36517         SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
36518       else
36519         SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
36520       return true;
36521     }
36522   }
36523 
  // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
36525   // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
36526   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
36527                          (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
36528     unsigned MaxScale = 64 / MaskEltSize;
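    // For each power-of-2 scale, check whether every group of Scale mask
    // elements takes its value from source element i in the low slot, with
    // the remaining slots undef (ANY_EXTEND) or undef/zero (ZERO_EXTEND).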
36529     for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
36530       bool MatchAny = true;
36531       bool MatchZero = true;
36532       unsigned NumDstElts = NumMaskElts / Scale;
36533       for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
36534         if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
36535           MatchAny = MatchZero = false;
36536           break;
36537         }
36538         MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
36539         MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
36540       }
36541       if (MatchAny || MatchZero) {
36542         assert(MatchZero && "Failed to match zext but matched aext?");
36543         unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
36544         MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
36545                                             MVT::getIntegerVT(MaskEltSize);
36546         SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
36547 
36548         Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
36549         if (SrcVT.getVectorNumElements() != NumDstElts)
36550           Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
36551 
36552         DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
36553         DstVT = MVT::getVectorVT(DstVT, NumDstElts);
36554         return true;
36555       }
36556     }
36557   }
36558 
  // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit
  // elements (MOVSS).
36560   if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
36561        (MaskEltSize == 16 && Subtarget.hasFP16())) &&
36562       isUndefOrEqual(Mask[0], 0) &&
36563       isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
36564     Shuffle = X86ISD::VZEXT_MOVL;
36565     if (MaskEltSize == 16)
36566       SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
36567     else
36568       SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
36569     return true;
36570   }
36571 
  // Check if we have SSE3, which will let us use MOVDDUP etc. These
  // instructions are no slower than UNPCKLPD but have the option of folding
  // the input operand even from an unaligned memory load.
36575   if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
36576     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
36577       Shuffle = X86ISD::MOVDDUP;
36578       SrcVT = DstVT = MVT::v2f64;
36579       return true;
36580     }
36581     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
36582       Shuffle = X86ISD::MOVSLDUP;
36583       SrcVT = DstVT = MVT::v4f32;
36584       return true;
36585     }
36586     if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
36587       Shuffle = X86ISD::MOVSHDUP;
36588       SrcVT = DstVT = MVT::v4f32;
36589       return true;
36590     }
36591   }
36592 
36593   if (MaskVT.is256BitVector() && AllowFloatDomain) {
36594     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
36595     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
36596       Shuffle = X86ISD::MOVDDUP;
36597       SrcVT = DstVT = MVT::v4f64;
36598       return true;
36599     }
36600     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
36601       Shuffle = X86ISD::MOVSLDUP;
36602       SrcVT = DstVT = MVT::v8f32;
36603       return true;
36604     }
36605     if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
36606       Shuffle = X86ISD::MOVSHDUP;
36607       SrcVT = DstVT = MVT::v8f32;
36608       return true;
36609     }
36610   }
36611 
36612   if (MaskVT.is512BitVector() && AllowFloatDomain) {
36613     assert(Subtarget.hasAVX512() &&
36614            "AVX512 required for 512-bit vector shuffles");
36615     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
36616       Shuffle = X86ISD::MOVDDUP;
36617       SrcVT = DstVT = MVT::v8f64;
36618       return true;
36619     }
36620     if (isTargetShuffleEquivalent(
36621             MaskVT, Mask,
36622             {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
36623       Shuffle = X86ISD::MOVSLDUP;
36624       SrcVT = DstVT = MVT::v16f32;
36625       return true;
36626     }
36627     if (isTargetShuffleEquivalent(
36628             MaskVT, Mask,
36629             {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
36630       Shuffle = X86ISD::MOVSHDUP;
36631       SrcVT = DstVT = MVT::v16f32;
36632       return true;
36633     }
36634   }
36635 
36636   return false;
36637 }
36638 
36639 // Attempt to match a combined shuffle mask against supported unary immediate
36640 // permute instructions.
36641 // TODO: Investigate sharing more of this with shuffle lowering.
36642 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
36643                                      const APInt &Zeroable,
36644                                      bool AllowFloatDomain, bool AllowIntDomain,
36645                                      const X86Subtarget &Subtarget,
36646                                      unsigned &Shuffle, MVT &ShuffleVT,
36647                                      unsigned &PermuteImm) {
36648   unsigned NumMaskElts = Mask.size();
36649   unsigned InputSizeInBits = MaskVT.getSizeInBits();
36650   unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
36651   MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
36652   bool ContainsZeros = isAnyZero(Mask);
36653 
  // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
36655   if (!ContainsZeros && MaskScalarSizeInBits == 64) {
36656     // Check for lane crossing permutes.
36657     if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
36658       // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
36659       if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
36660         Shuffle = X86ISD::VPERMI;
36661         ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
36662         PermuteImm = getV4X86ShuffleImm(Mask);
36663         return true;
36664       }
36665       if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
36666         SmallVector<int, 4> RepeatedMask;
36667         if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
36668           Shuffle = X86ISD::VPERMI;
36669           ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
36670           PermuteImm = getV4X86ShuffleImm(RepeatedMask);
36671           return true;
36672         }
36673       }
36674     } else if (AllowFloatDomain && Subtarget.hasAVX()) {
36675       // VPERMILPD can permute with a non-repeating shuffle.
36676       Shuffle = X86ISD::VPERMILPI;
36677       ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
36678       PermuteImm = 0;
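      // Each f64 element contributes one bit to the VPERMILPD immediate,
      // selecting the low or high double within its 128-bit lane.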
36679       for (int i = 0, e = Mask.size(); i != e; ++i) {
36680         int M = Mask[i];
36681         if (M == SM_SentinelUndef)
36682           continue;
36683         assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
36684         PermuteImm |= (M & 1) << i;
36685       }
36686       return true;
36687     }
36688   }
36689 
36690   // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
  // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
36693   if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
36694       !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
36695     SmallVector<int, 4> RepeatedMask;
36696     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
36697       // Narrow the repeated mask to create 32-bit element permutes.
36698       SmallVector<int, 4> WordMask = RepeatedMask;
36699       if (MaskScalarSizeInBits == 64)
36700         narrowShuffleMaskElts(2, RepeatedMask, WordMask);
36701 
36702       Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
36703       ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
36704       ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
36705       PermuteImm = getV4X86ShuffleImm(WordMask);
36706       return true;
36707     }
36708   }
36709 
36710   // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
36711   if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
36712       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36713        (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36714        (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
36715     SmallVector<int, 4> RepeatedMask;
36716     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
36717       ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
36718       ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
36719 
36720       // PSHUFLW: permute lower 4 elements only.
36721       if (isUndefOrInRange(LoMask, 0, 4) &&
36722           isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
36723         Shuffle = X86ISD::PSHUFLW;
36724         ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
36725         PermuteImm = getV4X86ShuffleImm(LoMask);
36726         return true;
36727       }
36728 
36729       // PSHUFHW: permute upper 4 elements only.
36730       if (isUndefOrInRange(HiMask, 4, 8) &&
36731           isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
36732         // Offset the HiMask so that we can create the shuffle immediate.
36733         int OffsetHiMask[4];
36734         for (int i = 0; i != 4; ++i)
36735           OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
36736 
36737         Shuffle = X86ISD::PSHUFHW;
36738         ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
36739         PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
36740         return true;
36741       }
36742     }
36743   }
36744 
36745   // Attempt to match against byte/bit shifts.
36746   if (AllowIntDomain &&
36747       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36748        (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36749        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
36750     int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
36751                                        Mask, 0, Zeroable, Subtarget);
36752     if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
36753                          32 <= ShuffleVT.getScalarSizeInBits())) {
36754       PermuteImm = (unsigned)ShiftAmt;
36755       return true;
36756     }
36757   }
36758 
36759   // Attempt to match against bit rotates.
36760   if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
36761       ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
36762        Subtarget.hasAVX512())) {
36763     int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
36764                                             Subtarget, Mask);
36765     if (0 < RotateAmt) {
36766       Shuffle = X86ISD::VROTLI;
36767       PermuteImm = (unsigned)RotateAmt;
36768       return true;
36769     }
36770   }
36771 
36772   return false;
36773 }
36774 
36775 // Attempt to match a combined unary shuffle mask against supported binary
36776 // shuffle instructions.
36777 // TODO: Investigate sharing more of this with shuffle lowering.
36778 static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
36779                                bool AllowFloatDomain, bool AllowIntDomain,
36780                                SDValue &V1, SDValue &V2, const SDLoc &DL,
36781                                SelectionDAG &DAG, const X86Subtarget &Subtarget,
36782                                unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
36783                                bool IsUnary) {
36784   unsigned NumMaskElts = Mask.size();
36785   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
36786 
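  // 128-bit special cases: low/high element splats (UNPCKL/UNPCKH, or
  // MOVLHPS/MOVHLPS when SSE2 is unavailable) and the MOVSD/MOVSS/MOVSH
  // scalar-move blends.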
36787   if (MaskVT.is128BitVector()) {
36788     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
36789       V2 = V1;
36790       V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
36791       Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
36792       SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
36793       return true;
36794     }
36795     if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
36796       V2 = V1;
36797       Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
36798       SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
36799       return true;
36800     }
36801     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
36802         Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
36803       std::swap(V1, V2);
36804       Shuffle = X86ISD::MOVSD;
36805       SrcVT = DstVT = MVT::v2f64;
36806       return true;
36807     }
36808     if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
36809         (AllowFloatDomain || !Subtarget.hasSSE41())) {
36810       Shuffle = X86ISD::MOVSS;
36811       SrcVT = DstVT = MVT::v4f32;
36812       return true;
36813     }
36814     if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7}) &&
36815         Subtarget.hasFP16()) {
36816       Shuffle = X86ISD::MOVSH;
36817       SrcVT = DstVT = MVT::v8f16;
36818       return true;
36819     }
36820   }
36821 
36822   // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
36823   if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
36824       ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
36825       ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
36826     if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
36827                              Subtarget)) {
36828       DstVT = MaskVT;
36829       return true;
36830     }
36831   }
36832 
36833   // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
36834   if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
36835       (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36836       (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
36837       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36838       (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
36839     if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
36840                               Subtarget)) {
36841       SrcVT = DstVT = MaskVT;
36842       if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
36843         SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
36844       return true;
36845     }
36846   }
36847 
36848   // Attempt to match against an OR if we're performing a blend shuffle and the
36849   // non-blended source element is zero in each case.
36850   if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
36851       (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
36852     bool IsBlend = true;
36853     unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
36854     unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
36855     unsigned Scale1 = NumV1Elts / NumMaskElts;
36856     unsigned Scale2 = NumV2Elts / NumMaskElts;
36857     APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
36858     APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
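    // Classify each mask element: for OR to act as a blend, every element taken
    // from one source needs the corresponding element(s) of the other source to
    // be provably zero, so record which elements must be checked below.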
36859     for (unsigned i = 0; i != NumMaskElts; ++i) {
36860       int M = Mask[i];
36861       if (M == SM_SentinelUndef)
36862         continue;
36863       if (M == SM_SentinelZero) {
36864         DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
36865         DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
36866         continue;
36867       }
36868       if (M == (int)i) {
36869         DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
36870         continue;
36871       }
36872       if (M == (int)(i + NumMaskElts)) {
36873         DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
36874         continue;
36875       }
36876       IsBlend = false;
36877       break;
36878     }
36879     if (IsBlend) {
36880       if (DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
36881           DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
36882         Shuffle = ISD::OR;
36883         SrcVT = DstVT = MaskVT.changeTypeToInteger();
36884         return true;
36885       }
36886       if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
36887         // FIXME: handle mismatched sizes?
36888         // TODO: investigate if `ISD::OR` handling in
36889         // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
36890         auto computeKnownBitsElementWise = [&DAG](SDValue V) {
36891           unsigned NumElts = V.getValueType().getVectorNumElements();
36892           KnownBits Known(NumElts);
36893           for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
36894             APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
36895             KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
36896             if (PeepholeKnown.isZero())
36897               Known.Zero.setBit(EltIdx);
36898             if (PeepholeKnown.isAllOnes())
36899               Known.One.setBit(EltIdx);
36900           }
36901           return Known;
36902         };
36903 
36904         KnownBits V1Known = computeKnownBitsElementWise(V1);
36905         KnownBits V2Known = computeKnownBitsElementWise(V2);
36906 
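        // Re-check the blend element by element: an element taken from one source
        // is still correct if the other source's element is known zero, or if the
        // chosen element is known all-ones (the OR then produces that same value).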
36907         for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
36908           int M = Mask[i];
36909           if (M == SM_SentinelUndef)
36910             continue;
36911           if (M == SM_SentinelZero) {
36912             IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
36913             continue;
36914           }
36915           if (M == (int)i) {
36916             IsBlend &= V2Known.Zero[i] || V1Known.One[i];
36917             continue;
36918           }
36919           if (M == (int)(i + NumMaskElts)) {
36920             IsBlend &= V1Known.Zero[i] || V2Known.One[i];
36921             continue;
36922           }
36923           llvm_unreachable("will not get here.");
36924         }
36925         if (IsBlend) {
36926           Shuffle = ISD::OR;
36927           SrcVT = DstVT = MaskVT.changeTypeToInteger();
36928           return true;
36929         }
36930       }
36931     }
36932   }
36933 
36934   return false;
36935 }
36936 
36937 static bool matchBinaryPermuteShuffle(
36938     MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
36939     bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
36940     const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
36941     unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
36942   unsigned NumMaskElts = Mask.size();
36943   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
36944 
36945   // Attempt to match against VALIGND/VALIGNQ rotate.
36946   if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
36947       ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
36948        (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
36949        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
36950     if (!isAnyZero(Mask)) {
36951       int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
36952       if (0 < Rotation) {
36953         Shuffle = X86ISD::VALIGN;
36954         if (EltSizeInBits == 64)
36955           ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
36956         else
36957           ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
36958         PermuteImm = Rotation;
36959         return true;
36960       }
36961     }
36962   }
36963 
36964   // Attempt to match against PALIGNR byte rotate.
36965   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
36966                          (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36967                          (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
36968     int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
36969     if (0 < ByteRotation) {
36970       Shuffle = X86ISD::PALIGNR;
36971       ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
36972       PermuteImm = ByteRotation;
36973       return true;
36974     }
36975   }
36976 
36977   // Attempt to combine to X86ISD::BLENDI.
36978   if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
36979                             (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
36980       (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
36981     uint64_t BlendMask = 0;
36982     bool ForceV1Zero = false, ForceV2Zero = false;
36983     SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
36984     if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
36985                             ForceV2Zero, BlendMask)) {
36986       if (MaskVT == MVT::v16i16) {
36987         // We can only use v16i16 PBLENDW if the lanes are repeated.
36988         SmallVector<int, 8> RepeatedMask;
36989         if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
36990                                         RepeatedMask)) {
36991           assert(RepeatedMask.size() == 8 &&
36992                  "Repeated mask size doesn't match!");
36993           PermuteImm = 0;
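          // Build the PBLENDW immediate from the repeated lane mask: bit i
          // selects V2 for element i within every 128-bit lane.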
36994           for (int i = 0; i < 8; ++i)
36995             if (RepeatedMask[i] >= 8)
36996               PermuteImm |= 1 << i;
36997           V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
36998           V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
36999           Shuffle = X86ISD::BLENDI;
37000           ShuffleVT = MaskVT;
37001           return true;
37002         }
37003       } else {
37004         V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37005         V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37006         PermuteImm = (unsigned)BlendMask;
37007         Shuffle = X86ISD::BLENDI;
37008         ShuffleVT = MaskVT;
37009         return true;
37010       }
37011     }
37012   }
37013 
37014   // Attempt to combine to INSERTPS, but only if it has elements that need to
37015   // be set to zero.
37016   if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
37017       MaskVT.is128BitVector() && isAnyZero(Mask) &&
37018       matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
37019     Shuffle = X86ISD::INSERTPS;
37020     ShuffleVT = MVT::v4f32;
37021     return true;
37022   }
37023 
37024   // Attempt to combine to SHUFPD.
37025   if (AllowFloatDomain && EltSizeInBits == 64 &&
37026       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37027        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
37028        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37029     bool ForceV1Zero = false, ForceV2Zero = false;
37030     if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
37031                                PermuteImm, Mask, Zeroable)) {
37032       V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37033       V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37034       Shuffle = X86ISD::SHUFP;
37035       ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
37036       return true;
37037     }
37038   }
37039 
37040   // Attempt to combine to SHUFPS.
37041   if (AllowFloatDomain && EltSizeInBits == 32 &&
37042       ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
37043        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
37044        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37045     SmallVector<int, 4> RepeatedMask;
37046     if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
37047       // Match each half of the repeated mask to determine whether it's just
37048       // referencing one of the vectors, is zeroable, or is entirely undef.
37049       auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
37050         int M0 = RepeatedMask[Offset];
37051         int M1 = RepeatedMask[Offset + 1];
37052 
37053         if (isUndefInRange(RepeatedMask, Offset, 2)) {
37054           return DAG.getUNDEF(MaskVT);
37055         } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
37056           S0 = (SM_SentinelUndef == M0 ? -1 : 0);
37057           S1 = (SM_SentinelUndef == M1 ? -1 : 1);
37058           return getZeroVector(MaskVT, Subtarget, DAG, DL);
37059         } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
37060           S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
37061           S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
37062           return V1;
37063         } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
37064           S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
37065           S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
37066           return V2;
37067         }
37068 
37069         return SDValue();
37070       };
37071 
37072       int ShufMask[4] = {-1, -1, -1, -1};
37073       SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
37074       SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
37075 
37076       if (Lo && Hi) {
37077         V1 = Lo;
37078         V2 = Hi;
37079         Shuffle = X86ISD::SHUFP;
37080         ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
37081         PermuteImm = getV4X86ShuffleImm(ShufMask);
37082         return true;
37083       }
37084     }
37085   }
37086 
37087   // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
37088   if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
37089       MaskVT.is128BitVector() &&
37090       matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
37091     Shuffle = X86ISD::INSERTPS;
37092     ShuffleVT = MVT::v4f32;
37093     return true;
37094   }
37095 
37096   return false;
37097 }
37098 
37099 static SDValue combineX86ShuffleChainWithExtract(
37100     ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
37101     bool HasVariableMask, bool AllowVariableCrossLaneMask,
37102     bool AllowVariablePerLaneMask, SelectionDAG &DAG,
37103     const X86Subtarget &Subtarget);
37104 
37105 /// Combine an arbitrary chain of shuffles into a single instruction if
37106 /// possible.
37107 ///
37108 /// This is the leaf of the recursive combine below. When we have found some
37109 /// chain of single-use x86 shuffle instructions and accumulated the combined
37110 /// shuffle mask represented by them, this will try to pattern match that mask
37111 /// into either a single instruction if there is a special purpose instruction
37112 /// for this operation, or into a PSHUFB instruction which is a fully general
37113 /// instruction but should only be used to replace chains over a certain depth.
37114 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
37115                                       ArrayRef<int> BaseMask, int Depth,
37116                                       bool HasVariableMask,
37117                                       bool AllowVariableCrossLaneMask,
37118                                       bool AllowVariablePerLaneMask,
37119                                       SelectionDAG &DAG,
37120                                       const X86Subtarget &Subtarget) {
37121   assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
37122   assert((Inputs.size() == 1 || Inputs.size() == 2) &&
37123          "Unexpected number of shuffle inputs!");
37124 
37125   SDLoc DL(Root);
37126   MVT RootVT = Root.getSimpleValueType();
37127   unsigned RootSizeInBits = RootVT.getSizeInBits();
37128   unsigned NumRootElts = RootVT.getVectorNumElements();
37129 
37130   // Canonicalize shuffle input op to the requested type.
37131   // TODO: Support cases where Op is smaller than VT.
37132   auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
37133     if (VT.getSizeInBits() < Op.getValueSizeInBits())
37134       Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
37135     return DAG.getBitcast(VT, Op);
37136   };
37137 
37138   // Find the inputs that enter the chain. Note that multiple uses are OK
37139   // here, we're not going to remove the operands we find.
37140   bool UnaryShuffle = (Inputs.size() == 1);
37141   SDValue V1 = peekThroughBitcasts(Inputs[0]);
37142   SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
37143                              : peekThroughBitcasts(Inputs[1]));
37144 
37145   MVT VT1 = V1.getSimpleValueType();
37146   MVT VT2 = V2.getSimpleValueType();
37147   assert(VT1.getSizeInBits() == RootSizeInBits &&
37148          VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
37149 
37150   SDValue Res;
37151 
37152   unsigned NumBaseMaskElts = BaseMask.size();
37153   if (NumBaseMaskElts == 1) {
37154     assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
37155     return CanonicalizeShuffleInput(RootVT, V1);
37156   }
37157 
37158   bool OptForSize = DAG.shouldOptForSize();
37159   unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
37160   bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
37161                      (RootVT.isFloatingPoint() && Depth >= 1) ||
37162                      (RootVT.is256BitVector() && !Subtarget.hasAVX2());
37163 
37164   // Don't combine if we are an AVX512/EVEX target and the mask element size
37165   // is different from the root element size - this would prevent writemasks
37166   // from being reused.
37167   bool IsMaskedShuffle = false;
37168   if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
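    // A masked shuffle is recognized by its single use being a VSELECT whose
    // condition operand is a vXi1 mask (an AVX512 writemask select).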
37169     if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
37170         Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
37171       IsMaskedShuffle = true;
37172     }
37173   }
37174 
37175   // If we are shuffling a broadcast (and not introducing zeros) then we can
37176   // just use the broadcast directly. This works for smaller broadcast elements
37177   // as well, as they already repeat across each mask element.
37178   if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
37179       (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
37180       V1.getValueSizeInBits() >= RootSizeInBits) {
37181     return CanonicalizeShuffleInput(RootVT, V1);
37182   }
37183 
37184   SmallVector<int, 64> Mask(BaseMask.begin(), BaseMask.end());
37185 
37186   // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
37187   // etc. can be simplified.
37188   if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
37189     SmallVector<int> ScaledMask, IdentityMask;
37190     unsigned NumElts = VT1.getVectorNumElements();
37191     if (Mask.size() <= NumElts &&
37192         scaleShuffleElements(Mask, NumElts, ScaledMask)) {
37193       for (unsigned i = 0; i != NumElts; ++i)
37194         IdentityMask.push_back(i);
37195       if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
37196         return CanonicalizeShuffleInput(RootVT, V1);
37197     }
37198   }
37199 
37200   // Handle 128/256-bit lane shuffles of 512-bit vectors.
37201   if (RootVT.is512BitVector() &&
37202       (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
37203     // If the upper subvectors are zeroable, then an extract+insert is cheaper
37204     // than using X86ISD::SHUF128. The insertion is free, even if it has
37205     // to zero the upper subvectors.
37206     if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
37207       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37208         return SDValue(); // Nothing to do!
37209       assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
37210              "Unexpected lane shuffle");
37211       Res = CanonicalizeShuffleInput(RootVT, V1);
37212       unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
37213       bool UseZero = isAnyZero(Mask);
37214       Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
37215       return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
37216     }
37217 
37218     // Narrow shuffle mask to v4x128.
37219     SmallVector<int, 4> ScaledMask;
37220     assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
37221     narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
37222 
37223     // Try to lower to vshuf64x2/vshuf32x4.
37224     auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
37225                             ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
37226                             SelectionDAG &DAG) {
37227       unsigned PermMask = 0;
37228       // Ensure elements came from the same Op.
37229       SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
37230       for (int i = 0; i < 4; ++i) {
37231         assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
37232         if (ScaledMask[i] < 0)
37233           continue;
37234 
37235         SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
37236         unsigned OpIndex = i / 2;
37237         if (Ops[OpIndex].isUndef())
37238           Ops[OpIndex] = Op;
37239         else if (Ops[OpIndex] != Op)
37240           return SDValue();
37241 
37242         // Convert the 128-bit shuffle mask selection values into 128-bit
37243         // selection bits defined by a vshuf64x2 instruction's immediate control
37244         // byte.
37245         PermMask |= (ScaledMask[i] % 4) << (i * 2);
37246       }
37247 
37248       return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
37249                          CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
37250                          CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
37251                          DAG.getTargetConstant(PermMask, DL, MVT::i8));
37252     };
37253 
37254     // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
37255     // doesn't work because our mask is for 128 bits and we don't have an MVT
37256     // to match that.
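    // Prefer VPERMQ/VPERMPD when each destination 256-bit half only draws from
    // the matching source half and both halves use the same lane pattern; fall
    // through so the later immediate permute matching can handle it instead of
    // SHUF128.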
37257     bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
37258                        isUndefOrInRange(ScaledMask[1], 0, 2) &&
37259                        isUndefOrInRange(ScaledMask[2], 2, 4) &&
37260                        isUndefOrInRange(ScaledMask[3], 2, 4) &&
37261                        (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
37262                         ScaledMask[0] == (ScaledMask[2] % 2)) &&
37263                        (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
37264                         ScaledMask[1] == (ScaledMask[3] % 2));
37265 
37266     if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
37267       if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
37268         return SDValue(); // Nothing to do!
37269       MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
37270       if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
37271         return DAG.getBitcast(RootVT, V);
37272     }
37273   }
37274 
37275   // Handle 128-bit lane shuffles of 256-bit vectors.
37276   if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
37277     // If the upper half is zeroable, then an extract+insert is cheaper than
37278     // using X86ISD::VPERM2X128. The insertion is free, even if it has to
37279     // zero the upper half.
37280     if (isUndefOrZero(Mask[1])) {
37281       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37282         return SDValue(); // Nothing to do!
37283       assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
37284       Res = CanonicalizeShuffleInput(RootVT, V1);
37285       Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
37286       return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
37287                             256);
37288     }
37289 
37290     // If we're inserting the low subvector, an insert-subvector 'concat'
37291     // pattern is quicker than VPERM2X128.
37292     // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
37293     if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
37294         !Subtarget.hasAVX2()) {
37295       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37296         return SDValue(); // Nothing to do!
37297       SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
37298       SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
37299       Hi = extractSubVector(Hi, 0, DAG, DL, 128);
37300       return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
37301     }
37302 
37303     if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
37304       return SDValue(); // Nothing to do!
37305 
37306     // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
37307     // we need to use the zeroing feature.
37308     // Prefer blends for sequential shuffles unless we are optimizing for size.
37309     if (UnaryShuffle &&
37310         !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
37311         (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
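      // Build the VPERM2X128 immediate: bits[1:0]/bits[5:4] select the source
      // 128-bit half for the low/high result and bit 3 / bit 7 zero that half.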
37312       unsigned PermMask = 0;
37313       PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
37314       PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
37315       return DAG.getNode(
37316           X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
37317           DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
37318     }
37319 
37320     if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
37321       return SDValue(); // Nothing to do!
37322 
37323     // TODO - handle AVX512VL cases with X86ISD::SHUF128.
37324     if (!UnaryShuffle && !IsMaskedShuffle) {
37325       assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
37326              "Unexpected shuffle sentinel value");
37327       // Prefer blends to X86ISD::VPERM2X128.
37328       if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
37329         unsigned PermMask = 0;
37330         PermMask |= ((Mask[0] & 3) << 0);
37331         PermMask |= ((Mask[1] & 3) << 4);
37332         SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
37333         SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
37334         return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
37335                           CanonicalizeShuffleInput(RootVT, LHS),
37336                           CanonicalizeShuffleInput(RootVT, RHS),
37337                           DAG.getTargetConstant(PermMask, DL, MVT::i8));
37338       }
37339     }
37340   }
37341 
37342   // For masks that have been widened to 128-bit elements or more,
37343   // narrow back down to 64-bit elements.
37344   if (BaseMaskEltSizeInBits > 64) {
37345     assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
37346     int MaskScale = BaseMaskEltSizeInBits / 64;
37347     SmallVector<int, 64> ScaledMask;
37348     narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
37349     Mask = std::move(ScaledMask);
37350   }
37351 
37352   // For masked shuffles, we're trying to match the root width for better
37353   // writemask folding, attempt to scale the mask.
37354   // TODO - variable shuffles might need this to be widened again.
37355   if (IsMaskedShuffle && NumRootElts > Mask.size()) {
37356     assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
37357     int MaskScale = NumRootElts / Mask.size();
37358     SmallVector<int, 64> ScaledMask;
37359     narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
37360     Mask = std::move(ScaledMask);
37361   }
37362 
37363   unsigned NumMaskElts = Mask.size();
37364   unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
37365 
37366   // Determine the effective mask value type.
37367   FloatDomain &= (32 <= MaskEltSizeInBits);
37368   MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
37369                            : MVT::getIntegerVT(MaskEltSizeInBits);
37370   MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
37371 
37372   // Only allow legal mask types.
37373   if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
37374     return SDValue();
37375 
37376   // Attempt to match the mask against known shuffle patterns.
37377   MVT ShuffleSrcVT, ShuffleVT;
37378   unsigned Shuffle, PermuteImm;
37379 
37380   // Which shuffle domains are permitted?
37381   // Permit domain crossing at higher combine depths.
37382   // TODO: Should we indicate which domain is preferred if both are allowed?
37383   bool AllowFloatDomain = FloatDomain || (Depth >= 3);
37384   bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
37385                         (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
37386 
37387   // Determine zeroable mask elements.
37388   APInt KnownUndef, KnownZero;
37389   resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
37390   APInt Zeroable = KnownUndef | KnownZero;
37391 
37392   if (UnaryShuffle) {
37393     // Attempt to match against broadcast-from-vector.
37394     // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
37395     if ((Subtarget.hasAVX2() ||
37396          (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
37397         (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
37398       if (isUndefOrEqual(Mask, 0)) {
37399         if (V1.getValueType() == MaskVT &&
37400             V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37401             X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
37402           if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
37403             return SDValue(); // Nothing to do!
37404           Res = V1.getOperand(0);
37405           Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
37406           return DAG.getBitcast(RootVT, Res);
37407         }
37408         if (Subtarget.hasAVX2()) {
37409           if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
37410             return SDValue(); // Nothing to do!
37411           Res = CanonicalizeShuffleInput(MaskVT, V1);
37412           Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
37413           return DAG.getBitcast(RootVT, Res);
37414         }
37415       }
37416     }
37417 
37418     if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
37419                           Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
37420         (!IsMaskedShuffle ||
37421          (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37422       if (Depth == 0 && Root.getOpcode() == Shuffle)
37423         return SDValue(); // Nothing to do!
37424       Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
37425       Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
37426       return DAG.getBitcast(RootVT, Res);
37427     }
37428 
37429     if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
37430                                  AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
37431                                  PermuteImm) &&
37432         (!IsMaskedShuffle ||
37433          (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37434       if (Depth == 0 && Root.getOpcode() == Shuffle)
37435         return SDValue(); // Nothing to do!
37436       Res = CanonicalizeShuffleInput(ShuffleVT, V1);
37437       Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
37438                         DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37439       return DAG.getBitcast(RootVT, Res);
37440     }
37441   }
37442 
37443   // Attempt to combine to INSERTPS, but only if the inserted element has come
37444   // from a scalar.
37445   // TODO: Handle other insertions here as well?
37446   if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
37447       Subtarget.hasSSE41() &&
37448       !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
37449     if (MaskEltSizeInBits == 32) {
37450       SDValue SrcV1 = V1, SrcV2 = V2;
37451       if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
37452                                  DAG) &&
37453           SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
37454         if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
37455           return SDValue(); // Nothing to do!
37456         Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
37457                           CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
37458                           CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
37459                           DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37460         return DAG.getBitcast(RootVT, Res);
37461       }
37462     }
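    // A {0,2} 64-bit element blend where V2 is a SCALAR_TO_VECTOR of a value no
    // wider than 32 bits can be treated as inserting that scalar into f32
    // element 2; the upper half of V2's element is undef so only the low 32
    // bits matter.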
37463     if (MaskEltSizeInBits == 64 &&
37464         isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
37465         V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37466         V2.getScalarValueSizeInBits() <= 32) {
37467       if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
37468         return SDValue(); // Nothing to do!
37469       PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
37470       Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
37471                         CanonicalizeShuffleInput(MVT::v4f32, V1),
37472                         CanonicalizeShuffleInput(MVT::v4f32, V2),
37473                         DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37474       return DAG.getBitcast(RootVT, Res);
37475     }
37476   }
37477 
37478   SDValue NewV1 = V1; // Save operands in case early exit happens.
37479   SDValue NewV2 = V2;
37480   if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
37481                          NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
37482                          ShuffleVT, UnaryShuffle) &&
37483       (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37484     if (Depth == 0 && Root.getOpcode() == Shuffle)
37485       return SDValue(); // Nothing to do!
37486     NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
37487     NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
37488     Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
37489     return DAG.getBitcast(RootVT, Res);
37490   }
37491 
37492   NewV1 = V1; // Save operands in case early exit happens.
37493   NewV2 = V2;
37494   if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
37495                                 AllowIntDomain, NewV1, NewV2, DL, DAG,
37496                                 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
37497       (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37498     if (Depth == 0 && Root.getOpcode() == Shuffle)
37499       return SDValue(); // Nothing to do!
37500     NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
37501     NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
37502     Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
37503                       DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37504     return DAG.getBitcast(RootVT, Res);
37505   }
37506 
37507   // Typically from here on, we need an integer version of MaskVT.
37508   MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
37509   IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
37510 
37511   // Annoyingly, SSE4A instructions don't map into the above match helpers.
37512   if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
37513     uint64_t BitLen, BitIdx;
37514     if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
37515                             Zeroable)) {
37516       if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
37517         return SDValue(); // Nothing to do!
37518       V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
37519       Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
37520                         DAG.getTargetConstant(BitLen, DL, MVT::i8),
37521                         DAG.getTargetConstant(BitIdx, DL, MVT::i8));
37522       return DAG.getBitcast(RootVT, Res);
37523     }
37524 
37525     if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
37526       if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
37527         return SDValue(); // Nothing to do!
37528       V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
37529       V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
37530       Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
37531                         DAG.getTargetConstant(BitLen, DL, MVT::i8),
37532                         DAG.getTargetConstant(BitIdx, DL, MVT::i8));
37533       return DAG.getBitcast(RootVT, Res);
37534     }
37535   }
37536 
37537   // Match shuffle against TRUNCATE patterns.
37538   if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
37539     // Match against a VTRUNC instruction, accounting for src/dst sizes.
37540     if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
37541                              Subtarget)) {
37542       bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
37543                         ShuffleSrcVT.getVectorNumElements();
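      // Matching element counts mean a plain ISD::TRUNCATE suffices; otherwise
      // use the target-specific VTRUNC node.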
37544       unsigned Opc =
37545           IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
37546       if (Depth == 0 && Root.getOpcode() == Opc)
37547         return SDValue(); // Nothing to do!
37548       V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
37549       Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
37550       if (ShuffleVT.getSizeInBits() < RootSizeInBits)
37551         Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
37552       return DAG.getBitcast(RootVT, Res);
37553     }
37554 
37555     // Do we need a more general binary truncation pattern?
37556     if (RootSizeInBits < 512 &&
37557         ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
37558          (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
37559         (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
37560         isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
37561       if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
37562         return SDValue(); // Nothing to do!
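      // A {0,2,4,...} mask spanning both sources selects the even narrow
      // elements, which is exactly truncate(concat(V1,V2)).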
37563       ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
37564       ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
37565       V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
37566       V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
37567       ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
37568       ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
37569       Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
37570       Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
37571       return DAG.getBitcast(RootVT, Res);
37572     }
37573   }
37574 
37575   // Don't try to re-form single instruction chains under any circumstances now
37576   // that we've done encoding canonicalization for them.
37577   if (Depth < 1)
37578     return SDValue();
37579 
37580   // Depth threshold above which we can efficiently use variable mask shuffles.
37581   int VariableCrossLaneShuffleDepth =
37582       Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
37583   int VariablePerLaneShuffleDepth =
37584       Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
37585   AllowVariableCrossLaneMask &=
37586       (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
37587   AllowVariablePerLaneMask &=
37588       (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
37589   // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
37590   // higher depth before combining them.
37591   bool AllowBWIVPERMV3 =
37592       (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
37593 
37594   bool MaskContainsZeros = isAnyZero(Mask);
37595 
37596   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
37597     // If we have a single input lane-crossing shuffle then lower to VPERMV.
37598     if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
37599       if (Subtarget.hasAVX2() &&
37600           (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
37601         SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
37602         Res = CanonicalizeShuffleInput(MaskVT, V1);
37603         Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
37604         return DAG.getBitcast(RootVT, Res);
37605       }
37606       // AVX512 variants (non-VLX will pad to 512-bit shuffles).
37607       if ((Subtarget.hasAVX512() &&
37608            (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
37609             MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
37610           (Subtarget.hasBWI() &&
37611            (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
37612           (Subtarget.hasVBMI() &&
37613            (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
37614         V1 = CanonicalizeShuffleInput(MaskVT, V1);
37615         V2 = DAG.getUNDEF(MaskVT);
37616         Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
37617         return DAG.getBitcast(RootVT, Res);
37618       }
37619     }
37620 
37621     // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
37622     // vector as the second source (non-VLX will pad to 512-bit shuffles).
37623     if (UnaryShuffle && AllowVariableCrossLaneMask &&
37624         ((Subtarget.hasAVX512() &&
37625           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
37626            MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
37627            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
37628            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
37629          (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
37630           (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
37631          (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
37632           (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
37633       // Adjust shuffle mask - replace SM_SentinelZero with second source index.
37634       for (unsigned i = 0; i != NumMaskElts; ++i)
37635         if (Mask[i] == SM_SentinelZero)
37636           Mask[i] = NumMaskElts + i;
37637       V1 = CanonicalizeShuffleInput(MaskVT, V1);
37638       V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
37639       Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
37640       return DAG.getBitcast(RootVT, Res);
37641     }
37642 
37643     // If that failed and either input is extracted then try to combine as a
37644     // shuffle with the larger type.
37645     if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
37646             Inputs, Root, BaseMask, Depth, HasVariableMask,
37647             AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
37648             Subtarget))
37649       return WideShuffle;
37650 
37651     // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
37652     // (non-VLX will pad to 512-bit shuffles).
37653     if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
37654         ((Subtarget.hasAVX512() &&
37655           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
37656            MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
37657            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
37658            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
37659          (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
37660           (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
37661          (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
37662           (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
37663       V1 = CanonicalizeShuffleInput(MaskVT, V1);
37664       V2 = CanonicalizeShuffleInput(MaskVT, V2);
37665       Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
37666       return DAG.getBitcast(RootVT, Res);
37667     }
37668     return SDValue();
37669   }
37670 
37671   // See if we can combine a single input shuffle with zeros to a bit-mask,
37672   // which is much simpler than any shuffle.
37673   if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
37674       isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
37675       DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
37676     APInt Zero = APInt::getZero(MaskEltSizeInBits);
37677     APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
37678     APInt UndefElts(NumMaskElts, 0);
37679     SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
37680     for (unsigned i = 0; i != NumMaskElts; ++i) {
37681       int M = Mask[i];
37682       if (M == SM_SentinelUndef) {
37683         UndefElts.setBit(i);
37684         continue;
37685       }
37686       if (M == SM_SentinelZero)
37687         continue;
37688       EltBits[i] = AllOnes;
37689     }
37690     SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
37691     Res = CanonicalizeShuffleInput(MaskVT, V1);
37692     unsigned AndOpcode =
37693         MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
37694     Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
37695     return DAG.getBitcast(RootVT, Res);
37696   }
37697 
37698   // If we have a single input shuffle with different shuffle patterns in the
37699   // 128-bit lanes, lower to VPERMILPS with a variable mask.
37700   // TODO: Combine other mask types at higher depths.
37701   if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
37702       ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
37703        (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
37704     SmallVector<SDValue, 16> VPermIdx;
37705     for (int M : Mask) {
37706       SDValue Idx =
37707           M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
37708       VPermIdx.push_back(Idx);
37709     }
37710     SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
37711     Res = CanonicalizeShuffleInput(MaskVT, V1);
37712     Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
37713     return DAG.getBitcast(RootVT, Res);
37714   }
37715 
37716   // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
37717   // to VPERMIL2PD/VPERMIL2PS.
37718   if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
37719       (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
37720        MaskVT == MVT::v8f32)) {
37721     // VPERMIL2 Operation.
37722     // Bits[3] - Match Bit.
37723     // Bits[2:1] - (Per Lane) PD Shuffle Mask.
37724     // Bits[2:0] - (Per Lane) PS Shuffle Mask.
37725     unsigned NumLanes = MaskVT.getSizeInBits() / 128;
37726     unsigned NumEltsPerLane = NumMaskElts / NumLanes;
37727     SmallVector<int, 8> VPerm2Idx;
37728     unsigned M2ZImm = 0;
37729     for (int M : Mask) {
37730       if (M == SM_SentinelUndef) {
37731         VPerm2Idx.push_back(-1);
37732         continue;
37733       }
37734       if (M == SM_SentinelZero) {
37735         M2ZImm = 2;
37736         VPerm2Idx.push_back(8);
37737         continue;
37738       }
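      // Per-element selector: the element index within its lane plus a
      // source-select offset; 64-bit (PD) selectors are shifted left one bit to
      // land in bits [2:1] of the control byte.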
37739       int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
37740       Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
37741       VPerm2Idx.push_back(Index);
37742     }
37743     V1 = CanonicalizeShuffleInput(MaskVT, V1);
37744     V2 = CanonicalizeShuffleInput(MaskVT, V2);
37745     SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
37746     Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
37747                       DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
37748     return DAG.getBitcast(RootVT, Res);
37749   }
37750 
37751   // If we have 3 or more shuffle instructions or a chain involving a variable
37752   // mask, we can replace them with a single PSHUFB instruction profitably.
37753   // Intel's manuals suggest only using PSHUFB if doing so replaces 5
37754   // instructions, but in practice PSHUFB tends to be *very* fast, so we're
37755   // more aggressive.
37756   if (UnaryShuffle && AllowVariablePerLaneMask &&
37757       ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
37758        (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
37759        (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
37760     SmallVector<SDValue, 16> PSHUFBMask;
37761     int NumBytes = RootVT.getSizeInBits() / 8;
37762     int Ratio = NumBytes / NumMaskElts;
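    // Build the byte-level PSHUFB control: a byte with its high bit set (0x80)
    // zeroes that destination byte, otherwise it selects a source byte from
    // within the same 128-bit lane.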
37763     for (int i = 0; i < NumBytes; ++i) {
37764       int M = Mask[i / Ratio];
37765       if (M == SM_SentinelUndef) {
37766         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
37767         continue;
37768       }
37769       if (M == SM_SentinelZero) {
37770         PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
37771         continue;
37772       }
37773       M = Ratio * M + i % Ratio;
37774       assert((M / 16) == (i / 16) && "Lane crossing detected");
37775       PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
37776     }
37777     MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
37778     Res = CanonicalizeShuffleInput(ByteVT, V1);
37779     SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
37780     Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
37781     return DAG.getBitcast(RootVT, Res);
37782   }
37783 
37784   // With XOP, if we have a 128-bit binary input shuffle we can always combine
37785   // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
37786   // slower than PSHUFB on targets that support both.
37787   if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
37788       Subtarget.hasXOP()) {
37789     // VPPERM Mask Operation
37790     // Bits[4:0] - Byte Index (0 - 31)
37791     // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
37792     SmallVector<SDValue, 16> VPPERMMask;
37793     int NumBytes = 16;
37794     int Ratio = NumBytes / NumMaskElts;
37795     for (int i = 0; i < NumBytes; ++i) {
37796       int M = Mask[i / Ratio];
37797       if (M == SM_SentinelUndef) {
37798         VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
37799         continue;
37800       }
37801       if (M == SM_SentinelZero) {
37802         VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
37803         continue;
37804       }
37805       M = Ratio * M + i % Ratio;
37806       VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
37807     }
37808     MVT ByteVT = MVT::v16i8;
37809     V1 = CanonicalizeShuffleInput(ByteVT, V1);
37810     V2 = CanonicalizeShuffleInput(ByteVT, V2);
37811     SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
37812     Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
37813     return DAG.getBitcast(RootVT, Res);
37814   }
37815 
37816   // If that failed and either input is extracted then try to combine as a
37817   // shuffle with the larger type.
37818   if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
37819           Inputs, Root, BaseMask, Depth, HasVariableMask,
37820           AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
37821     return WideShuffle;
37822 
37823   // If we have a dual input shuffle then lower to VPERMV3,
37824   // (non-VLX will pad to 512-bit shuffles)
37825   if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
37826       ((Subtarget.hasAVX512() &&
37827         (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
37828          MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
37829          MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
37830          MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
37831          MaskVT == MVT::v16i32)) ||
37832        (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
37833         (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
37834          MaskVT == MVT::v32i16)) ||
37835        (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
37836         (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
37837          MaskVT == MVT::v64i8)))) {
37838     V1 = CanonicalizeShuffleInput(MaskVT, V1);
37839     V2 = CanonicalizeShuffleInput(MaskVT, V2);
37840     Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
37841     return DAG.getBitcast(RootVT, Res);
37842   }
37843 
37844   // Failed to find any combines.
37845   return SDValue();
37846 }
37847 
37848 // Combine an arbitrary chain of shuffles + extract_subvectors into a single
37849 // instruction if possible.
37850 //
37851 // Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
37852 // type size to attempt to combine:
37853 // shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
37854 // -->
37855 // extract_subvector(shuffle(x,y,m2),0)
37856 static SDValue combineX86ShuffleChainWithExtract(
37857     ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
37858     bool HasVariableMask, bool AllowVariableCrossLaneMask,
37859     bool AllowVariablePerLaneMask, SelectionDAG &DAG,
37860     const X86Subtarget &Subtarget) {
37861   unsigned NumMaskElts = BaseMask.size();
37862   unsigned NumInputs = Inputs.size();
37863   if (NumInputs == 0)
37864     return SDValue();
37865 
37866   EVT RootVT = Root.getValueType();
37867   unsigned RootSizeInBits = RootVT.getSizeInBits();
37868   assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
37869 
37870   SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
37871   SmallVector<unsigned, 4> Offsets(NumInputs, 0);
37872 
37873   // Peek through subvectors.
37874   // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
37875   unsigned WideSizeInBits = RootSizeInBits;
37876   for (unsigned i = 0; i != NumInputs; ++i) {
37877     SDValue &Src = WideInputs[i];
37878     unsigned &Offset = Offsets[i];
37879     Src = peekThroughBitcasts(Src);
37880     EVT BaseVT = Src.getValueType();
37881     while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
37882       Offset += Src.getConstantOperandVal(1);
37883       Src = Src.getOperand(0);
37884     }
37885     WideSizeInBits = std::max(WideSizeInBits,
37886                               (unsigned)Src.getValueSizeInBits());
37887     assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
37888            "Unexpected subvector extraction");
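    // Convert the offset from BaseVT elements into a subvector index, then into
    // units of the root mask.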
37889     Offset /= BaseVT.getVectorNumElements();
37890     Offset *= NumMaskElts;
37891   }
37892 
37893   // Bail if we're always extracting from the lowest subvectors;
37894   // combineX86ShuffleChain should match this for the current width.
37895   if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
37896     return SDValue();
37897 
37898   unsigned Scale = WideSizeInBits / RootSizeInBits;
37899   assert((WideSizeInBits % RootSizeInBits) == 0 &&
37900          "Unexpected subvector extraction");
37901 
37902   // If the src vector types aren't the same, see if we can extend
37903   // them to match each other.
37904   // TODO: Support different scalar types?
37905   EVT WideSVT = WideInputs[0].getValueType().getScalarType();
37906   if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
37907         return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
37908                Op.getValueType().getScalarType() != WideSVT;
37909       }))
37910     return SDValue();
37911 
37912   for (SDValue &NewInput : WideInputs) {
37913     assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
37914            "Shuffle vector size mismatch");
37915     if (WideSizeInBits > NewInput.getValueSizeInBits())
37916       NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
37917                                 SDLoc(NewInput), WideSizeInBits);
37918     assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
37919            "Unexpected subvector extraction");
37920   }
37921 
37922   // Create new mask for larger type.
37923   for (unsigned i = 1; i != NumInputs; ++i)
37924     Offsets[i] += i * Scale * NumMaskElts;
37925 
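  // Remap each mask element into the widened mask space, adding the owning
  // input's subvector offset.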
37926   SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
37927   for (int &M : WideMask) {
37928     if (M < 0)
37929       continue;
37930     M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
37931   }
37932   WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
37933 
37934   // Remove unused/repeated shuffle source ops.
37935   resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
37936   assert(!WideInputs.empty() && "Shuffle with no inputs detected");
37937 
37938   if (WideInputs.size() > 2)
37939     return SDValue();
37940 
37941   // Increase depth for every upper subvector we've peeked through.
37942   Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
37943 
37944   // Attempt to combine wider chain.
37945   // TODO: Can we use a better Root?
37946   SDValue WideRoot = WideInputs[0];
37947   if (SDValue WideShuffle =
37948           combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
37949                                  HasVariableMask, AllowVariableCrossLaneMask,
37950                                  AllowVariablePerLaneMask, DAG, Subtarget)) {
37951     WideShuffle =
37952         extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
37953     return DAG.getBitcast(RootVT, WideShuffle);
37954   }
37955   return SDValue();
37956 }
37957 
37958 // Canonicalize the combined shuffle mask chain with horizontal ops.
37959 // NOTE: This may update the Ops and Mask.
37960 static SDValue canonicalizeShuffleMaskWithHorizOp(
37961     MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
37962     unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
37963     const X86Subtarget &Subtarget) {
37964   if (Mask.empty() || Ops.empty())
37965     return SDValue();
37966 
37967   SmallVector<SDValue> BC;
37968   for (SDValue Op : Ops)
37969     BC.push_back(peekThroughBitcasts(Op));
37970 
37971   // All ops must be the same horizop + type.
37972   SDValue BC0 = BC[0];
37973   EVT VT0 = BC0.getValueType();
37974   unsigned Opcode0 = BC0.getOpcode();
37975   if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
37976         return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
37977       }))
37978     return SDValue();
37979 
37980   bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
37981                   Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
37982   bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
37983   if (!isHoriz && !isPack)
37984     return SDValue();
37985 
37986   // Do all ops have a single use?
37987   bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
37988     return Op.hasOneUse() &&
37989            peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
37990   });
37991 
37992   int NumElts = VT0.getVectorNumElements();
37993   int NumLanes = VT0.getSizeInBits() / 128;
37994   int NumEltsPerLane = NumElts / NumLanes;
37995   int NumHalfEltsPerLane = NumEltsPerLane / 2;
37996   MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
37997   unsigned EltSizeInBits = RootSizeInBits / Mask.size();
37998 
37999   if (NumEltsPerLane >= 4 &&
38000       (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
38001     SmallVector<int> LaneMask, ScaledMask;
38002     if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
38003         scaleShuffleElements(LaneMask, 4, ScaledMask)) {
38004       // See if we can remove the shuffle by reordering the HOP chain so that
38005       // the HOP args are pre-shuffled.
38006       // TODO: Generalize to any sized/depth chain.
38007       // TODO: Add support for PACKSS/PACKUS.
38008       if (isHoriz) {
38009         // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
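              // Each op contributes 4 elements to the scaled mask: M / 4 selects
              // the op, (M % 4) >= 2 picks its second (vs. first) HOP operand,
              // and M % 2 selects the operand of that inner HOP.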
38010         auto GetHOpSrc = [&](int M) {
38011           if (M == SM_SentinelUndef)
38012             return DAG.getUNDEF(VT0);
38013           if (M == SM_SentinelZero)
38014             return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
38015           SDValue Src0 = BC[M / 4];
38016           SDValue Src1 = Src0.getOperand((M % 4) >= 2);
38017           if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
38018             return Src1.getOperand(M % 2);
38019           return SDValue();
38020         };
38021         SDValue M0 = GetHOpSrc(ScaledMask[0]);
38022         SDValue M1 = GetHOpSrc(ScaledMask[1]);
38023         SDValue M2 = GetHOpSrc(ScaledMask[2]);
38024         SDValue M3 = GetHOpSrc(ScaledMask[3]);
38025         if (M0 && M1 && M2 && M3) {
38026           SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
38027           SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
38028           return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
38029         }
38030       }
38031       // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
38032       if (Ops.size() >= 2) {
38033         SDValue LHS, RHS;
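              // Collect at most two distinct HOP source operands (LHS/RHS) and
              // record in PostMask which element of the new HOP(LHS, RHS) each
              // scaled mask element maps to.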
38034         auto GetHOpSrc = [&](int M, int &OutM) {
38035           // TODO: Support SM_SentinelZero
38036           if (M < 0)
38037             return M == SM_SentinelUndef;
38038           SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
38039           if (!LHS || LHS == Src) {
38040             LHS = Src;
38041             OutM = (M % 2);
38042             return true;
38043           }
38044           if (!RHS || RHS == Src) {
38045             RHS = Src;
38046             OutM = (M % 2) + 2;
38047             return true;
38048           }
38049           return false;
38050         };
38051         int PostMask[4] = {-1, -1, -1, -1};
38052         if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
38053             GetHOpSrc(ScaledMask[1], PostMask[1]) &&
38054             GetHOpSrc(ScaledMask[2], PostMask[2]) &&
38055             GetHOpSrc(ScaledMask[3], PostMask[3])) {
38056           LHS = DAG.getBitcast(SrcVT, LHS);
38057           RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
38058           SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
38059           // Use SHUFPS for the permute so this will work on SSE3 targets;
38060           // shuffle combining and domain handling will simplify this later on.
38061           MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
38062           Res = DAG.getBitcast(ShuffleVT, Res);
38063           return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
38064                              getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
38065         }
38066       }
38067     }
38068   }
38069 
38070   if (2 < Ops.size())
38071     return SDValue();
38072 
38073   SDValue BC1 = BC[BC.size() - 1];
38074   if (Mask.size() == VT0.getVectorNumElements()) {
38075     // Canonicalize binary shuffles of horizontal ops that use the
38076     // same sources to a unary shuffle.
38077     // TODO: Try to perform this fold even if the shuffle remains.
38078     if (Ops.size() == 2) {
38079       auto ContainsOps = [](SDValue HOp, SDValue Op) {
38080         return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
38081       };
38082       // Commute if all BC0's ops are contained in BC1.
38083       if (ContainsOps(BC1, BC0.getOperand(0)) &&
38084           ContainsOps(BC1, BC0.getOperand(1))) {
38085         ShuffleVectorSDNode::commuteMask(Mask);
38086         std::swap(Ops[0], Ops[1]);
38087         std::swap(BC0, BC1);
38088       }
38089 
38090       // If BC1 can be represented by BC0, then convert to unary shuffle.
38091       if (ContainsOps(BC0, BC1.getOperand(0)) &&
38092           ContainsOps(BC0, BC1.getOperand(1))) {
38093         for (int &M : Mask) {
38094           if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
38095             continue;
38096           int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
38097           M -= NumElts + (SubLane * NumHalfEltsPerLane);
38098           if (BC1.getOperand(SubLane) != BC0.getOperand(0))
38099             M += NumHalfEltsPerLane;
38100         }
38101       }
38102     }
38103 
38104     // Canonicalize unary horizontal ops to only refer to lower halves.
38105     for (int i = 0; i != NumElts; ++i) {
38106       int &M = Mask[i];
38107       if (isUndefOrZero(M))
38108         continue;
38109       if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
38110           (M % NumEltsPerLane) >= NumHalfEltsPerLane)
38111         M -= NumHalfEltsPerLane;
38112       if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
38113           (M % NumEltsPerLane) >= NumHalfEltsPerLane)
38114         M -= NumHalfEltsPerLane;
38115     }
38116   }
38117 
38118   // Combine a binary shuffle of 2 similar 'Horizontal' instructions into a
38119   // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
38120   // represents the LHS/RHS inputs for the lower/upper halves.
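        // WideMask128 indices 0/1 select the LHS/RHS operand of BC0 and 2/3
        // select the LHS/RHS operand of BC1 for the merged horizontal op.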
38121   SmallVector<int, 16> TargetMask128, WideMask128;
38122   if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
38123       scaleShuffleElements(TargetMask128, 2, WideMask128)) {
38124     assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
38125     bool SingleOp = (Ops.size() == 1);
38126     if (isPack || OneUseOps ||
38127         shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
38128       SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
38129       SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
38130       Lo = Lo.getOperand(WideMask128[0] & 1);
38131       Hi = Hi.getOperand(WideMask128[1] & 1);
38132       if (SingleOp) {
38133         SDValue Undef = DAG.getUNDEF(SrcVT);
38134         SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
38135         Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
38136         Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
38137         Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
38138         Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
38139       }
38140       return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
38141     }
38142   }
38143 
38144   return SDValue();
38145 }
38146 
38147 // Attempt to constant fold all of the constant source ops.
38148 // Returns the folded constant if the entire shuffle folds to a constant.
38149 // TODO: Extend this to merge multiple constant Ops and update the mask.
38150 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
38151                                            ArrayRef<int> Mask, SDValue Root,
38152                                            bool HasVariableMask,
38153                                            SelectionDAG &DAG,
38154                                            const X86Subtarget &Subtarget) {
38155   MVT VT = Root.getSimpleValueType();
38156 
38157   unsigned SizeInBits = VT.getSizeInBits();
38158   unsigned NumMaskElts = Mask.size();
38159   unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
38160   unsigned NumOps = Ops.size();
38161 
38162   // Extract constant bits from each source op.
38163   bool OneUseConstantOp = false;
38164   SmallVector<APInt, 16> UndefEltsOps(NumOps);
38165   SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
38166   for (unsigned i = 0; i != NumOps; ++i) {
38167     SDValue SrcOp = Ops[i];
38168     OneUseConstantOp |= SrcOp.hasOneUse();
38169     if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
38170                                        RawBitsOps[i]))
38171       return SDValue();
38172   }
38173 
38174   // If we're optimizing for size, only fold if at least one of the constants
38175   // has a single use or the combined shuffle has included a variable mask
38176   // shuffle; this avoids constant pool bloat.
38177   bool IsOptimizingSize = DAG.shouldOptForSize();
38178   if (IsOptimizingSize && !OneUseConstantOp && !HasVariableMask)
38179     return SDValue();
38180 
38181   // Shuffle the constant bits according to the mask.
38182   SDLoc DL(Root);
38183   APInt UndefElts(NumMaskElts, 0);
38184   APInt ZeroElts(NumMaskElts, 0);
38185   APInt ConstantElts(NumMaskElts, 0);
38186   SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
38187                                         APInt::getZero(MaskSizeInBits));
38188   for (unsigned i = 0; i != NumMaskElts; ++i) {
38189     int M = Mask[i];
38190     if (M == SM_SentinelUndef) {
38191       UndefElts.setBit(i);
38192       continue;
38193     } else if (M == SM_SentinelZero) {
38194       ZeroElts.setBit(i);
38195       continue;
38196     }
38197     assert(0 <= M && M < (int)(NumMaskElts * NumOps));
38198 
38199     unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
38200     unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
38201 
38202     auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
38203     if (SrcUndefElts[SrcMaskIdx]) {
38204       UndefElts.setBit(i);
38205       continue;
38206     }
38207 
38208     auto &SrcEltBits = RawBitsOps[SrcOpIdx];
38209     APInt &Bits = SrcEltBits[SrcMaskIdx];
38210     if (!Bits) {
38211       ZeroElts.setBit(i);
38212       continue;
38213     }
38214 
38215     ConstantElts.setBit(i);
38216     ConstantBitData[i] = Bits;
38217   }
38218   assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
38219 
38220   // Attempt to create a zero vector.
38221   if ((UndefElts | ZeroElts).isAllOnes())
38222     return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
38223 
38224   // Create the constant data.
38225   MVT MaskSVT;
38226   if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
38227     MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
38228   else
38229     MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
38230 
38231   MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
38232   if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
38233     return SDValue();
38234 
38235   SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
38236   return DAG.getBitcast(VT, CstOp);
38237 }
38238 
38239 namespace llvm {
38240   namespace X86 {
38241     enum {
38242       MaxShuffleCombineDepth = 8
38243     };
38244   }
38245 } // namespace llvm
38246 
38247 /// Fully generic combining of x86 shuffle instructions.
38248 ///
38249 /// This should be the last combine run over the x86 shuffle instructions. Once
38250 /// they have been fully optimized, this will recursively consider all chains
38251 /// of single-use shuffle instructions, build a generic model of the cumulative
38252 /// shuffle operation, and check for simpler instructions which implement this
38253 /// operation. We use this primarily for two purposes:
38254 ///
38255 /// 1) Collapse generic shuffles to specialized single instructions when
38256 ///    equivalent. In most cases, this is just an encoding size win, but
38257 ///    sometimes we will collapse multiple generic shuffles into a single
38258 ///    special-purpose shuffle.
38259 /// 2) Look for sequences of shuffle instructions with 3 or more total
38260 ///    instructions, and replace them with the slightly more expensive SSSE3
38261 ///    PSHUFB instruction if available. We do this as the last combining step
38262 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
38263 ///    a suitable short sequence of other instructions. The PSHUFB will either
38264 ///    use a register or have to read from memory and so is slightly (but only
38265 ///    slightly) more expensive than the other shuffle instructions.
38266 ///
38267 /// Because this is inherently a quadratic operation (for each shuffle in
38268 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
38269 /// This should never be an issue in practice as the shuffle lowering doesn't
38270 /// produce sequences of more than 8 instructions.
38271 ///
38272 /// FIXME: We will currently miss some cases where the redundant shuffling
38273 /// would simplify under the threshold for PSHUFB formation because of
38274 /// combine-ordering. To fix this, we should do the redundant instruction
38275 /// combining in this recursive walk.
38276 static SDValue combineX86ShufflesRecursively(
38277     ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
38278     ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
38279     unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
38280     bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38281     const X86Subtarget &Subtarget) {
38282   assert(RootMask.size() > 0 &&
38283          (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
38284          "Illegal shuffle root mask");
38285   assert(Root.getSimpleValueType().isVector() &&
38286          "Shuffles operate on vector types!");
38287   unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
38288 
38289   // Bound the depth of our recursive combine because this is ultimately
38290   // quadratic in nature.
38291   if (Depth >= MaxDepth)
38292     return SDValue();
38293 
38294   // Directly rip through bitcasts to find the underlying operand.
38295   SDValue Op = SrcOps[SrcOpIndex];
38296   Op = peekThroughOneUseBitcasts(Op);
38297 
38298   EVT VT = Op.getValueType();
38299   if (!VT.isVector() || !VT.isSimple())
38300     return SDValue(); // Bail if we hit a non-simple non-vector.
38301 
38302   // FIXME: Just bail on f16 for now.
38303   if (VT.getVectorElementType() == MVT::f16)
38304     return SDValue();
38305 
38306   assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
38307          "Can only combine shuffles up to the size of the root op.");
38308 
38309   // Extract target shuffle mask and resolve sentinels and inputs.
38310   // TODO - determine Op's demanded elts from RootMask.
38311   SmallVector<int, 64> OpMask;
38312   SmallVector<SDValue, 2> OpInputs;
38313   APInt OpUndef, OpZero;
38314   APInt OpDemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
38315   bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
38316   if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
38317                               OpZero, DAG, Depth, false))
38318     return SDValue();
38319 
38320   // Shuffle inputs must not be larger than the shuffle result.
38321   // TODO: Relax this for single input faux shuffles (trunc/extract_subvector).
38322   if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
38323         return OpInput.getValueSizeInBits() > VT.getSizeInBits();
38324       }))
38325     return SDValue();
38326 
38327   // If the shuffle result was smaller than the root, we need to adjust the
38328   // mask indices and pad the mask with undefs.
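        // For multi-input ops each input's index range is first rescaled so that,
        // once the mask is padded with undefs below, mask element M still selects
        // element (M % PaddedMaskSize) of input (M / PaddedMaskSize).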
38329   if (RootSizeInBits > VT.getSizeInBits()) {
38330     unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
38331     unsigned OpMaskSize = OpMask.size();
38332     if (OpInputs.size() > 1) {
38333       unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
38334       for (int &M : OpMask) {
38335         if (M < 0)
38336           continue;
38337         int EltIdx = M % OpMaskSize;
38338         int OpIdx = M / OpMaskSize;
38339         M = (PaddedMaskSize * OpIdx) + EltIdx;
38340       }
38341     }
38342     OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
38343     OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
38344     OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
38345   }
38346 
38347   SmallVector<int, 64> Mask;
38348   SmallVector<SDValue, 16> Ops;
38349 
38350   // We don't need to merge masks if the root is empty.
38351   bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
38352   if (EmptyRoot) {
38353     // Only resolve zeros if it will remove an input; otherwise we might end
38354     // up in an infinite loop.
38355     bool ResolveKnownZeros = true;
38356     if (!OpZero.isZero()) {
38357       APInt UsedInputs = APInt::getZero(OpInputs.size());
38358       for (int i = 0, e = OpMask.size(); i != e; ++i) {
38359         int M = OpMask[i];
38360         if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
38361           continue;
38362         UsedInputs.setBit(M / OpMask.size());
38363         if (UsedInputs.isAllOnes()) {
38364           ResolveKnownZeros = false;
38365           break;
38366         }
38367       }
38368     }
38369     resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
38370                                       ResolveKnownZeros);
38371 
38372     Mask = OpMask;
38373     Ops.append(OpInputs.begin(), OpInputs.end());
38374   } else {
38375     resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
38376 
38377     // Add the inputs to the Ops list, avoiding duplicates.
38378     Ops.append(SrcOps.begin(), SrcOps.end());
38379 
38380     auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
38381       // Attempt to find an existing match.
38382       SDValue InputBC = peekThroughBitcasts(Input);
38383       for (int i = 0, e = Ops.size(); i < e; ++i)
38384         if (InputBC == peekThroughBitcasts(Ops[i]))
38385           return i;
38386       // Match failed - should we replace an existing Op?
38387       if (InsertionPoint >= 0) {
38388         Ops[InsertionPoint] = Input;
38389         return InsertionPoint;
38390       }
38391       // Add to the end of the Ops list.
38392       Ops.push_back(Input);
38393       return Ops.size() - 1;
38394     };
38395 
38396     SmallVector<int, 2> OpInputIdx;
38397     for (SDValue OpInput : OpInputs)
38398       OpInputIdx.push_back(
38399           AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
38400 
38401     assert(((RootMask.size() > OpMask.size() &&
38402              RootMask.size() % OpMask.size() == 0) ||
38403             (OpMask.size() > RootMask.size() &&
38404              OpMask.size() % RootMask.size() == 0) ||
38405             OpMask.size() == RootMask.size()) &&
38406            "The smaller number of elements must divide the larger.");
38407 
38408     // This function can be performance-critical, so we rely on the power-of-2
38409     // knowledge that we have about the mask sizes to replace div/rem ops with
38410     // bit-masks and shifts.
38411     assert(isPowerOf2_32(RootMask.size()) &&
38412            "Non-power-of-2 shuffle mask sizes");
38413     assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
38414     unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
38415     unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
38416 
38417     unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
38418     unsigned RootRatio =
38419         std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
38420     unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
38421     assert((RootRatio == 1 || OpRatio == 1) &&
38422            "Must not have a ratio for both incoming and op masks!");
38423 
38424     assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
38425     assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
38426     assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
38427     unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
38428     unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
38429 
38430     Mask.resize(MaskWidth, SM_SentinelUndef);
38431 
38432     // Merge this shuffle operation's mask into our accumulated mask. Note that
38433     // this shuffle's mask will be the first applied to the input, followed by
38434     // the root mask to get us all the way to the root value arrangement. The
38435     // reason for this order is that we are recursing up the operation chain.
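          // E.g. a 4-element root mask feeding an 8-element op mask gives
          // MaskWidth = 8, RootRatio = 2 and OpRatio = 1, so each root mask
          // element expands to two adjacent elements of the merged mask.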
38436     for (unsigned i = 0; i < MaskWidth; ++i) {
38437       unsigned RootIdx = i >> RootRatioLog2;
38438       if (RootMask[RootIdx] < 0) {
38439         // This is a zero or undef lane, we're done.
38440         Mask[i] = RootMask[RootIdx];
38441         continue;
38442       }
38443 
38444       unsigned RootMaskedIdx =
38445           RootRatio == 1
38446               ? RootMask[RootIdx]
38447               : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
38448 
38449       // Just insert the scaled root mask value if it references an input other
38450       // than the SrcOp we're currently inserting.
38451       if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
38452           (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
38453         Mask[i] = RootMaskedIdx;
38454         continue;
38455       }
38456 
38457       RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
38458       unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
38459       if (OpMask[OpIdx] < 0) {
38460         // The incoming lanes are zero or undef; it doesn't matter which ones we
38461         // are using.
38462         Mask[i] = OpMask[OpIdx];
38463         continue;
38464       }
38465 
38466       // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
38467       unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
38468                                           : (OpMask[OpIdx] << OpRatioLog2) +
38469                                                 (RootMaskedIdx & (OpRatio - 1));
38470 
38471       OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
38472       int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
38473       assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
38474       OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
38475 
38476       Mask[i] = OpMaskedIdx;
38477     }
38478   }
38479 
38480   // Remove unused/repeated shuffle source ops.
38481   resolveTargetShuffleInputsAndMask(Ops, Mask);
38482 
38483   // Handle the all undef/zero/ones cases early.
38484   if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
38485     return DAG.getUNDEF(Root.getValueType());
38486   if (all_of(Mask, [](int Idx) { return Idx < 0; }))
38487     return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
38488                          SDLoc(Root));
38489   if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
38490       none_of(Mask, [](int M) { return M == SM_SentinelZero; }))
38491     return getOnesVector(Root.getValueType(), DAG, SDLoc(Root));
38492 
38493   assert(!Ops.empty() && "Shuffle with no inputs detected");
38494   HasVariableMask |= IsOpVariableMask;
38495 
38496   // Update the list of shuffle nodes that have been combined so far.
38497   SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
38498                                                 SrcNodes.end());
38499   CombinedNodes.push_back(Op.getNode());
38500 
38501   // See if we can recurse into each shuffle source op (if it's a target
38502   // shuffle). The source op should only be generally combined if it either has
38503   // a single use (i.e. current Op) or all its users have already been combined;
38504   // if not, then we can still combine but should prevent generation of variable
38505   // shuffles to avoid constant pool bloat.
38506   // Don't recurse if we already have more source ops than we can combine in
38507   // the remaining recursion depth.
38508   if (Ops.size() < (MaxDepth - Depth)) {
38509     for (int i = 0, e = Ops.size(); i < e; ++i) {
38510       // For empty roots, we need to resolve zeroable elements before combining
38511       // them with other shuffles.
38512       SmallVector<int, 64> ResolvedMask = Mask;
38513       if (EmptyRoot)
38514         resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
38515       bool AllowCrossLaneVar = false;
38516       bool AllowPerLaneVar = false;
38517       if (Ops[i].getNode()->hasOneUse() ||
38518           SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
38519         AllowCrossLaneVar = AllowVariableCrossLaneMask;
38520         AllowPerLaneVar = AllowVariablePerLaneMask;
38521       }
38522       if (SDValue Res = combineX86ShufflesRecursively(
38523               Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
38524               HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
38525               Subtarget))
38526         return Res;
38527     }
38528   }
38529 
38530   // Attempt to constant fold all of the constant source ops.
38531   if (SDValue Cst = combineX86ShufflesConstants(
38532           Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
38533     return Cst;
38534 
38535   // If constant folding failed and we only have constants, then the constants
38536   // have multiple uses feeding a single non-variable shuffle - just bail.
38537   if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
38538         APInt UndefElts;
38539         SmallVector<APInt> RawBits;
38540         unsigned EltSizeInBits = RootSizeInBits / Mask.size();
38541         return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
38542                                              RawBits);
38543       })) {
38544     return SDValue();
38545   }
38546 
38547   // Canonicalize the combined shuffle mask chain with horizontal ops.
38548   // NOTE: This will update the Ops and Mask.
38549   if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
38550           Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
38551     return DAG.getBitcast(Root.getValueType(), HOp);
38552 
38553   // Try to refine our inputs given our knowledge of the target shuffle mask.
38554   for (auto I : enumerate(Ops)) {
38555     int OpIdx = I.index();
38556     SDValue &Op = I.value();
38557 
38558     // What range of shuffle mask element values results in picking from Op?
38559     int Lo = OpIdx * Mask.size();
38560     int Hi = Lo + Mask.size();
38561 
38562     // Which elements of Op do we demand, given the mask's granularity?
38563     APInt OpDemandedElts(Mask.size(), 0);
38564     for (int MaskElt : Mask) {
38565       if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
38566         int OpEltIdx = MaskElt - Lo;
38567         OpDemandedElts.setBit(OpEltIdx);
38568       }
38569     }
38570 
38571     // Is the shuffle result smaller than the root?
38572     if (Op.getValueSizeInBits() < RootSizeInBits) {
38573       // We padded the mask with undefs. But we now need to undo that.
38574       unsigned NumExpectedVectorElts = Mask.size();
38575       unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
38576       unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
38577       assert(!OpDemandedElts.extractBits(
38578                  NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
38579              "Demanding the virtual undef widening padding?");
38580       OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
38581     }
38582 
38583     // The Op itself may be of different VT, so we need to scale the mask.
38584     unsigned NumOpElts = Op.getValueType().getVectorNumElements();
38585     APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
38586 
38587     // Can this operand be simplified any further, given its demanded elements?
38588     if (SDValue NewOp =
38589             DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
38590                 Op, OpScaledDemandedElts, DAG))
38591       Op = NewOp;
38592   }
38593   // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
38594 
38595   // Widen any subvector shuffle inputs we've collected.
38596   if (any_of(Ops, [RootSizeInBits](SDValue Op) {
38597         return Op.getValueSizeInBits() < RootSizeInBits;
38598       })) {
38599     for (SDValue &Op : Ops)
38600       if (Op.getValueSizeInBits() < RootSizeInBits)
38601         Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
38602                             RootSizeInBits);
38603     // Reresolve - we might have repeated subvector sources.
38604     resolveTargetShuffleInputsAndMask(Ops, Mask);
38605   }
38606 
38607   // We can only combine unary and binary shuffle mask cases.
38608   if (Ops.size() <= 2) {
38609     // Minor canonicalization of the accumulated shuffle mask to make it easier
38610     // to match below. All this does is detect masks with sequential pairs of
38611     // elements, and shrink them to the half-width mask. It does this in a loop
38612     // so it will reduce the size of the mask to the minimal width mask which
38613     // performs an equivalent shuffle.
38614     while (Mask.size() > 1) {
38615       SmallVector<int, 64> WidenedMask;
38616       if (!canWidenShuffleElements(Mask, WidenedMask))
38617         break;
38618       Mask = std::move(WidenedMask);
38619     }
38620 
38621     // Canonicalization of binary shuffle masks to improve pattern matching by
38622     // commuting the inputs.
38623     if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
38624       ShuffleVectorSDNode::commuteMask(Mask);
38625       std::swap(Ops[0], Ops[1]);
38626     }
38627 
38628     // Finally, try to combine into a single shuffle instruction.
38629     return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
38630                                   AllowVariableCrossLaneMask,
38631                                   AllowVariablePerLaneMask, DAG, Subtarget);
38632   }
38633 
38634   // If that failed and any input is extracted then try to combine as a
38635   // shuffle with the larger type.
38636   return combineX86ShuffleChainWithExtract(
38637       Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
38638       AllowVariablePerLaneMask, DAG, Subtarget);
38639 }
38640 
38641 /// Helper entry wrapper to combineX86ShufflesRecursively.
38642 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
38643                                              const X86Subtarget &Subtarget) {
38644   return combineX86ShufflesRecursively(
38645       {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
38646       /*HasVarMask*/ false,
38647       /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
38648       Subtarget);
38649 }
38650 
38651 /// Get the PSHUF-style mask from PSHUF node.
38652 ///
38653 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
38654 /// PSHUF-style masks that can be reused with such instructions.
38655 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
38656   MVT VT = N.getSimpleValueType();
38657   SmallVector<int, 4> Mask;
38658   SmallVector<SDValue, 2> Ops;
38659   bool HaveMask =
38660       getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
38661   (void)HaveMask;
38662   assert(HaveMask);
38663 
38664   // If we have more than 128-bits, only the low 128-bits of shuffle mask
38665   // matter. Check that the upper masks are repeats and remove them.
38666   if (VT.getSizeInBits() > 128) {
38667     int LaneElts = 128 / VT.getScalarSizeInBits();
38668 #ifndef NDEBUG
38669     for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
38670       for (int j = 0; j < LaneElts; ++j)
38671         assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
38672                "Mask doesn't repeat in high 128-bit lanes!");
38673 #endif
38674     Mask.resize(LaneElts);
38675   }
38676 
38677   switch (N.getOpcode()) {
38678   case X86ISD::PSHUFD:
38679     return Mask;
38680   case X86ISD::PSHUFLW:
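          // Only the low 4 word indices are shuffled; keep just those.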
38681     Mask.resize(4);
38682     return Mask;
38683   case X86ISD::PSHUFHW:
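          // The low 4 word indices are an identity; drop them and rebase the
          // shuffled high-word indices to 0..3.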
38684     Mask.erase(Mask.begin(), Mask.begin() + 4);
38685     for (int &M : Mask)
38686       M -= 4;
38687     return Mask;
38688   default:
38689     llvm_unreachable("No valid shuffle instruction found!");
38690   }
38691 }
38692 
38693 /// Search for a combinable shuffle across a chain ending in pshufd.
38694 ///
38695 /// We walk up the chain and look for a combinable shuffle, skipping over
38696 /// shuffles that we could hoist this shuffle's transformation past without
38697 /// altering anything.
38698 static SDValue
38699 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
38700                              SelectionDAG &DAG) {
38701   assert(N.getOpcode() == X86ISD::PSHUFD &&
38702          "Called with something other than an x86 128-bit half shuffle!");
38703   SDLoc DL(N);
38704 
38705   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
38706   // of the shuffles in the chain so that we can form a fresh chain to replace
38707   // this one.
38708   SmallVector<SDValue, 8> Chain;
38709   SDValue V = N.getOperand(0);
38710   for (; V.hasOneUse(); V = V.getOperand(0)) {
38711     switch (V.getOpcode()) {
38712     default:
38713       return SDValue(); // Nothing combined!
38714 
38715     case ISD::BITCAST:
38716       // Skip bitcasts as we always know the type for the target specific
38717       // instructions.
38718       continue;
38719 
38720     case X86ISD::PSHUFD:
38721       // Found another dword shuffle.
38722       break;
38723 
38724     case X86ISD::PSHUFLW:
38725       // Check that the low words (being shuffled) are the identity in the
38726       // dword shuffle, and the high words are self-contained.
38727       if (Mask[0] != 0 || Mask[1] != 1 ||
38728           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
38729         return SDValue();
38730 
38731       Chain.push_back(V);
38732       continue;
38733 
38734     case X86ISD::PSHUFHW:
38735       // Check that the high words (being shuffled) are the identity in the
38736       // dword shuffle, and the low words are self-contained.
38737       if (Mask[2] != 2 || Mask[3] != 3 ||
38738           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
38739         return SDValue();
38740 
38741       Chain.push_back(V);
38742       continue;
38743 
38744     case X86ISD::UNPCKL:
38745     case X86ISD::UNPCKH:
38746       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
38747       // shuffle into a preceding word shuffle.
38748       if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
38749           V.getSimpleValueType().getVectorElementType() != MVT::i16)
38750         return SDValue();
38751 
38752       // Search for a half-shuffle which we can combine with.
38753       unsigned CombineOp =
38754           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
38755       if (V.getOperand(0) != V.getOperand(1) ||
38756           !V->isOnlyUserOf(V.getOperand(0).getNode()))
38757         return SDValue();
38758       Chain.push_back(V);
38759       V = V.getOperand(0);
38760       do {
38761         switch (V.getOpcode()) {
38762         default:
38763           return SDValue(); // Nothing to combine.
38764 
38765         case X86ISD::PSHUFLW:
38766         case X86ISD::PSHUFHW:
38767           if (V.getOpcode() == CombineOp)
38768             break;
38769 
38770           Chain.push_back(V);
38771 
38772           LLVM_FALLTHROUGH;
38773         case ISD::BITCAST:
38774           V = V.getOperand(0);
38775           continue;
38776         }
38777         break;
38778       } while (V.hasOneUse());
38779       break;
38780     }
38781     // Break out of the loop if we break out of the switch.
38782     break;
38783   }
38784 
38785   if (!V.hasOneUse())
38786     // We fell out of the loop without finding a viable combining instruction.
38787     return SDValue();
38788 
38789   // Merge this node's mask and our incoming mask.
38790   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
38791   for (int &M : Mask)
38792     M = VMask[M];
38793   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
38794                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
38795 
38796   // Rebuild the chain around this new shuffle.
38797   while (!Chain.empty()) {
38798     SDValue W = Chain.pop_back_val();
38799 
38800     if (V.getValueType() != W.getOperand(0).getValueType())
38801       V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
38802 
38803     switch (W.getOpcode()) {
38804     default:
38805       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
38806 
38807     case X86ISD::UNPCKL:
38808     case X86ISD::UNPCKH:
38809       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
38810       break;
38811 
38812     case X86ISD::PSHUFD:
38813     case X86ISD::PSHUFLW:
38814     case X86ISD::PSHUFHW:
38815       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
38816       break;
38817     }
38818   }
38819   if (V.getValueType() != N.getValueType())
38820     V = DAG.getBitcast(N.getValueType(), V);
38821 
38822   // Return the new chain to replace N.
38823   return V;
38824 }
38825 
38826 // Attempt to commute shufps LHS loads:
38827 // permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
38828 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
38829                                       SelectionDAG &DAG) {
38830   // TODO: Add vXf64 support.
38831   if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
38832     return SDValue();
38833 
38834   // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
38835   auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
38836     if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
38837       return SDValue();
38838     SDValue N0 = V.getOperand(0);
38839     SDValue N1 = V.getOperand(1);
38840     unsigned Imm = V.getConstantOperandVal(2);
38841     const X86Subtarget &Subtarget =
38842         static_cast<const X86Subtarget &>(DAG.getSubtarget());
38843     if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
38844         X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
38845       return SDValue();
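          // The low/high nibble of the immediate selects elements from the
          // first/second operand, so commuting the operands swaps the nibbles;
          // the callers below fix up the half-swapped result.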
38846     Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
38847     return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
38848                        DAG.getTargetConstant(Imm, DL, MVT::i8));
38849   };
38850 
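        // When an operand is commuted, the elements that came from it end up in
        // the opposite half of each 128-bit lane of the new SHUFP, so flip the
        // high bit of the affected 2-bit index fields in the outer immediate
        // (0xAA for all four fields, 0x0A / 0xA0 for just the low / high pair).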
38851   switch (N.getOpcode()) {
38852   case X86ISD::VPERMILPI:
38853     if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
38854       unsigned Imm = N.getConstantOperandVal(1);
38855       return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
38856                          DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
38857     }
38858     break;
38859   case X86ISD::SHUFP: {
38860     SDValue N0 = N.getOperand(0);
38861     SDValue N1 = N.getOperand(1);
38862     unsigned Imm = N.getConstantOperandVal(2);
38863     if (N0 == N1) {
38864       if (SDValue NewSHUFP = commuteSHUFP(N, N0))
38865         return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
38866                            DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
38867     } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
38868       return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
38869                          DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
38870     } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
38871       return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
38872                          DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
38873     }
38874     break;
38875   }
38876   }
38877 
38878   return SDValue();
38879 }
38880 
38881 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
38882 static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
38883                                              const SDLoc &DL) {
38884   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38885   EVT ShuffleVT = N.getValueType();
38886 
38887   auto IsMergeableWithShuffle = [](SDValue Op) {
38888     // AllZeros/AllOnes constants are freely shuffled and will peek through
38889     // bitcasts. Other constant build vectors do not peek through bitcasts. Only
38890     // merge with target shuffles if it has one use so shuffle combining is
38891     // likely to kick in.
38892     return ISD::isBuildVectorAllOnes(Op.getNode()) ||
38893            ISD::isBuildVectorAllZeros(Op.getNode()) ||
38894            ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
38895            ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
38896            (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse());
38897   };
38898   auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
38899     // Ensure we only shuffle whole vector src elements, unless it's a logical
38900     // binop where we can more aggressively move shuffles from dst to src.
38901     return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
38902            (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
38903   };
38904 
38905   unsigned Opc = N.getOpcode();
38906   switch (Opc) {
38907   // Unary and Unary+Permute Shuffles.
38908   case X86ISD::PSHUFB: {
38909     // Don't merge PSHUFB if it contains zero'd elements.
38910     SmallVector<int> Mask;
38911     SmallVector<SDValue> Ops;
38912     if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
38913                               Mask))
38914       break;
38915     LLVM_FALLTHROUGH;
38916   }
38917   case X86ISD::VBROADCAST:
38918   case X86ISD::MOVDDUP:
38919   case X86ISD::PSHUFD:
38920   case X86ISD::PSHUFHW:
38921   case X86ISD::PSHUFLW:
38922   case X86ISD::VPERMI:
38923   case X86ISD::VPERMILPI: {
38924     if (N.getOperand(0).getValueType() == ShuffleVT &&
38925         N->isOnlyUserOf(N.getOperand(0).getNode())) {
38926       SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
38927       unsigned SrcOpcode = N0.getOpcode();
38928       if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
38929         SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
38930         SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
38931         if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) {
38932           SDValue LHS, RHS;
38933           Op00 = DAG.getBitcast(ShuffleVT, Op00);
38934           Op01 = DAG.getBitcast(ShuffleVT, Op01);
38935           if (N.getNumOperands() == 2) {
38936             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
38937             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
38938           } else {
38939             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
38940             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
38941           }
38942           EVT OpVT = N0.getValueType();
38943           return DAG.getBitcast(ShuffleVT,
38944                                 DAG.getNode(SrcOpcode, DL, OpVT,
38945                                             DAG.getBitcast(OpVT, LHS),
38946                                             DAG.getBitcast(OpVT, RHS)));
38947         }
38948       }
38949     }
38950     break;
38951   }
38952   // Binary and Binary+Permute Shuffles.
38953   case X86ISD::INSERTPS: {
38954     // Don't merge INSERTPS if it contains zero'd elements.
38955     unsigned InsertPSMask = N.getConstantOperandVal(2);
38956     unsigned ZeroMask = InsertPSMask & 0xF;
38957     if (ZeroMask != 0)
38958       break;
38959     LLVM_FALLTHROUGH;
38960   }
38961   case X86ISD::MOVSD:
38962   case X86ISD::MOVSS:
38963   case X86ISD::BLENDI:
38964   case X86ISD::SHUFP:
38965   case X86ISD::UNPCKH:
38966   case X86ISD::UNPCKL: {
38967     if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
38968         N->isOnlyUserOf(N.getOperand(1).getNode())) {
38969       SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
38970       SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
38971       unsigned SrcOpcode = N0.getOpcode();
38972       if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
38973           IsSafeToMoveShuffle(N0, SrcOpcode) &&
38974           IsSafeToMoveShuffle(N1, SrcOpcode)) {
38975         SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
38976         SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
38977         SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
38978         SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
38979         // Ensure the total number of shuffles doesn't increase by folding this
38980         // shuffle through to the source ops.
38981         if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
38982              (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
38983             ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
38984              (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
38985           SDValue LHS, RHS;
38986           Op00 = DAG.getBitcast(ShuffleVT, Op00);
38987           Op10 = DAG.getBitcast(ShuffleVT, Op10);
38988           Op01 = DAG.getBitcast(ShuffleVT, Op01);
38989           Op11 = DAG.getBitcast(ShuffleVT, Op11);
38990           if (N.getNumOperands() == 3) {
38991             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
38992             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
38993           } else {
38994             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
38995             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
38996           }
38997           EVT OpVT = N0.getValueType();
38998           return DAG.getBitcast(ShuffleVT,
38999                                 DAG.getNode(SrcOpcode, DL, OpVT,
39000                                             DAG.getBitcast(OpVT, LHS),
39001                                             DAG.getBitcast(OpVT, RHS)));
39002         }
39003       }
39004     }
39005     break;
39006   }
39007   }
39008   return SDValue();
39009 }
39010 
39011 /// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
39012 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
39013                                                       SelectionDAG &DAG,
39014                                                       const SDLoc &DL) {
39015   assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
39016 
39017   MVT VT = V.getSimpleValueType();
39018   SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
39019   SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
39020   unsigned SrcOpc0 = Src0.getOpcode();
39021   unsigned SrcOpc1 = Src1.getOpcode();
39022   EVT SrcVT0 = Src0.getValueType();
39023   EVT SrcVT1 = Src1.getValueType();
39024 
39025   if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
39026     return SDValue();
39027 
39028   switch (SrcOpc0) {
39029   case X86ISD::MOVDDUP: {
39030     SDValue LHS = Src0.getOperand(0);
39031     SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
39032     SDValue Res =
39033         DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
39034     Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
39035     return DAG.getBitcast(VT, Res);
39036   }
39037   case X86ISD::VPERMILPI:
39038     // TODO: Handle v4f64 permutes with different low/high lane masks.
39039     if (SrcVT0 == MVT::v4f64) {
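            // Each 128-bit lane is controlled by 2 bits of the immediate; only
            // handle the case where both lanes use the same selection.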
39040       uint64_t Mask = Src0.getConstantOperandVal(1);
39041       if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
39042         break;
39043     }
39044     LLVM_FALLTHROUGH;
39045   case X86ISD::VSHLI:
39046   case X86ISD::VSRLI:
39047   case X86ISD::VSRAI:
39048   case X86ISD::PSHUFD:
39049     if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
39050       SDValue LHS = Src0.getOperand(0);
39051       SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
39052       SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
39053                                 V.getOperand(2));
39054       Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
39055       return DAG.getBitcast(VT, Res);
39056     }
39057     break;
39058   }
39059 
39060   return SDValue();
39061 }
39062 
39063 /// Try to combine x86 target specific shuffles.
39064 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
39065                                     TargetLowering::DAGCombinerInfo &DCI,
39066                                     const X86Subtarget &Subtarget) {
39067   SDLoc DL(N);
39068   MVT VT = N.getSimpleValueType();
39069   SmallVector<int, 4> Mask;
39070   unsigned Opcode = N.getOpcode();
39071 
39072   if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
39073     return R;
39074 
39075   // Handle specific target shuffles.
39076   switch (Opcode) {
39077   case X86ISD::MOVDDUP: {
39078     SDValue Src = N.getOperand(0);
39079     // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
39080     if (VT == MVT::v2f64 && Src.hasOneUse() &&
39081         ISD::isNormalLoad(Src.getNode())) {
39082       LoadSDNode *LN = cast<LoadSDNode>(Src);
39083       if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
39084         SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
39085         DCI.CombineTo(N.getNode(), Movddup);
39086         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39087         DCI.recursivelyDeleteUnusedNodes(LN);
39088         return N; // Return N so it doesn't get rechecked!
39089       }
39090     }
39091 
39092     return SDValue();
39093   }
39094   case X86ISD::VBROADCAST: {
39095     SDValue Src = N.getOperand(0);
39096     SDValue BC = peekThroughBitcasts(Src);
39097     EVT SrcVT = Src.getValueType();
39098     EVT BCVT = BC.getValueType();
39099 
39100     // If broadcasting from another shuffle, attempt to simplify it.
39101     // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
39102     if (isTargetShuffle(BC.getOpcode()) &&
39103         VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
39104       unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
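            // Only the low Scale elements of BC, which together make up the
            // single scalar being broadcast, are demanded; the rest stay undef.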
39105       SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
39106                                         SM_SentinelUndef);
39107       for (unsigned i = 0; i != Scale; ++i)
39108         DemandedMask[i] = i;
39109       if (SDValue Res = combineX86ShufflesRecursively(
39110               {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
39111               X86::MaxShuffleCombineDepth,
39112               /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
39113               /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
39114         return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
39115                            DAG.getBitcast(SrcVT, Res));
39116     }
39117 
39118     // broadcast(bitcast(src)) -> bitcast(broadcast(src))
39119     // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
39120     if (Src.getOpcode() == ISD::BITCAST &&
39121         SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
39122         DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
39123         FixedVectorType::isValidElementType(
39124             BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
39125       EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
39126                                    VT.getVectorNumElements());
39127       return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
39128     }
39129 
39130     // Reduce broadcast source vector to lowest 128-bits.
39131     if (SrcVT.getSizeInBits() > 128)
39132       return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
39133                          extract128BitVector(Src, 0, DAG, DL));
39134 
39135     // broadcast(scalar_to_vector(x)) -> broadcast(x).
39136     if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
39137       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
39138 
39139     // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
39140     if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
39141         isNullConstant(Src.getOperand(1)) &&
39142         DAG.getTargetLoweringInfo().isTypeLegal(
39143             Src.getOperand(0).getValueType()))
39144       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
39145 
39146     // Share broadcast with the longest vector and extract low subvector (free).
39147     // Ensure the other broadcast is using the same SDValue, not just the node.
39148     for (SDNode *User : Src->uses())
39149       if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
39150           Src == User->getOperand(0) &&
39151           User->getValueSizeInBits(0).getFixedSize() >
39152               VT.getFixedSizeInBits()) {
39153         return extractSubVector(SDValue(User, 0), 0, DAG, DL,
39154                                 VT.getSizeInBits());
39155       }
39156 
39157     // vbroadcast(scalarload X) -> vbroadcast_load X
39158     // For float loads, extract other uses of the scalar from the broadcast.
39159     if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
39160         ISD::isNormalLoad(Src.getNode())) {
39161       LoadSDNode *LN = cast<LoadSDNode>(Src);
39162       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39163       SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39164       SDValue BcastLd =
39165           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39166                                   LN->getMemoryVT(), LN->getMemOperand());
39167       // If the load value is used only by N, replace it via CombineTo N.
39168       bool NoReplaceExtract = Src.hasOneUse();
39169       DCI.CombineTo(N.getNode(), BcastLd);
39170       if (NoReplaceExtract) {
39171         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39172         DCI.recursivelyDeleteUnusedNodes(LN);
39173       } else {
39174         SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
39175                                   DAG.getIntPtrConstant(0, DL));
39176         DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
39177       }
39178       return N; // Return N so it doesn't get rechecked!
39179     }
39180 
39181     // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
39182     // i16. So shrink it ourselves if we can make a broadcast_load.
39183     if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
39184         Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
39185       assert(Subtarget.hasAVX2() && "Expected AVX2");
39186       SDValue TruncIn = Src.getOperand(0);
39187 
      // If this is a truncate of a non-extending load, we can just narrow it
      // to use a broadcast_load.
39190       if (ISD::isNormalLoad(TruncIn.getNode())) {
39191         LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
        // Unless it's volatile or atomic.
39193         if (LN->isSimple()) {
39194           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39195           SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39196           SDValue BcastLd = DAG.getMemIntrinsicNode(
39197               X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
39198               LN->getPointerInfo(), LN->getOriginalAlign(),
39199               LN->getMemOperand()->getFlags());
39200           DCI.CombineTo(N.getNode(), BcastLd);
39201           DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39202           DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39203           return N; // Return N so it doesn't get rechecked!
39204         }
39205       }
39206 
39207       // If this is a truncate of an i16 extload, we can directly replace it.
39208       if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
39209           ISD::isEXTLoad(Src.getOperand(0).getNode())) {
39210         LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
39211         if (LN->getMemoryVT().getSizeInBits() == 16) {
39212           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39213           SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39214           SDValue BcastLd =
39215               DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39216                                       LN->getMemoryVT(), LN->getMemOperand());
39217           DCI.CombineTo(N.getNode(), BcastLd);
39218           DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39219           DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39220           return N; // Return N so it doesn't get rechecked!
39221         }
39222       }
39223 
      // If this is a truncate of a load that has been shifted right, we can
39225       // offset the pointer and use a narrower load.
39226       if (TruncIn.getOpcode() == ISD::SRL &&
39227           TruncIn.getOperand(0).hasOneUse() &&
39228           isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
39229           ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
39230         LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
39231         unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
39232         // Make sure the shift amount and the load size are divisible by 16.
39233         // Don't do this if the load is volatile or atomic.
39234         if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
39235             LN->isSimple()) {
39236           unsigned Offset = ShiftAmt / 8;
39237           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39238           SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
39239                                                  TypeSize::Fixed(Offset), DL);
39240           SDValue Ops[] = { LN->getChain(), Ptr };
39241           SDValue BcastLd = DAG.getMemIntrinsicNode(
39242               X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
39243               LN->getPointerInfo().getWithOffset(Offset),
39244               LN->getOriginalAlign(),
39245               LN->getMemOperand()->getFlags());
39246           DCI.CombineTo(N.getNode(), BcastLd);
39247           DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39248           DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39249           return N; // Return N so it doesn't get rechecked!
39250         }
39251       }
39252     }
39253 
39254     // vbroadcast(vzload X) -> vbroadcast_load X
39255     if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
39256       MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
39257       if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
39258         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39259         SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39260         SDValue BcastLd =
39261             DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39262                                     LN->getMemoryVT(), LN->getMemOperand());
39263         DCI.CombineTo(N.getNode(), BcastLd);
39264         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39265         DCI.recursivelyDeleteUnusedNodes(LN);
39266         return N; // Return N so it doesn't get rechecked!
39267       }
39268     }
39269 
39270     // vbroadcast(vector load X) -> vbroadcast_load
39271     if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
39272          SrcVT == MVT::v4i32) &&
39273         Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
39274       LoadSDNode *LN = cast<LoadSDNode>(Src);
39275       // Unless the load is volatile or atomic.
39276       if (LN->isSimple()) {
39277         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39278         SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39279         SDValue BcastLd = DAG.getMemIntrinsicNode(
39280             X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
39281             LN->getPointerInfo(), LN->getOriginalAlign(),
39282             LN->getMemOperand()->getFlags());
39283         DCI.CombineTo(N.getNode(), BcastLd);
39284         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39285         DCI.recursivelyDeleteUnusedNodes(LN);
39286         return N; // Return N so it doesn't get rechecked!
39287       }
39288     }
39289 
39290     return SDValue();
39291   }
39292   case X86ISD::VZEXT_MOVL: {
39293     SDValue N0 = N.getOperand(0);
39294 
    // If this is a vzmovl of a full vector load, replace it with a vzload,
    // unless the load is volatile.
39297     if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
39298       auto *LN = cast<LoadSDNode>(N0);
39299       if (SDValue VZLoad =
39300               narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
39301         DCI.CombineTo(N.getNode(), VZLoad);
39302         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39303         DCI.recursivelyDeleteUnusedNodes(LN);
39304         return N;
39305       }
39306     }
39307 
    // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the
    // broadcast and can just use a VZEXT_LOAD.
39310     // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
39311     if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
39312       auto *LN = cast<MemSDNode>(N0);
39313       if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
39314         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39315         SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39316         SDValue VZLoad =
39317             DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
39318                                     LN->getMemoryVT(), LN->getMemOperand());
39319         DCI.CombineTo(N.getNode(), VZLoad);
39320         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39321         DCI.recursivelyDeleteUnusedNodes(LN);
39322         return N;
39323       }
39324     }
39325 
39326     // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
39327     // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
39328     // if the upper bits of the i64 are zero.
39329     if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39330         N0.getOperand(0).hasOneUse() &&
39331         N0.getOperand(0).getValueType() == MVT::i64) {
39332       SDValue In = N0.getOperand(0);
39333       APInt Mask = APInt::getHighBitsSet(64, 32);
39334       if (DAG.MaskedValueIsZero(In, Mask)) {
39335         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
39336         MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
39337         SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
39338         SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
39339         return DAG.getBitcast(VT, Movl);
39340       }
39341     }
39342 
39343     // Load a scalar integer constant directly to XMM instead of transferring an
39344     // immediate value from GPR.
39345     // vzext_movl (scalar_to_vector C) --> load [C,0...]
39346     if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
39347       if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
39348         // Create a vector constant - scalar constant followed by zeros.
39349         EVT ScalarVT = N0.getOperand(0).getValueType();
39350         Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
39351         unsigned NumElts = VT.getVectorNumElements();
39352         Constant *Zero = ConstantInt::getNullValue(ScalarTy);
39353         SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
39354         ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
39355 
39356         // Load the vector constant from constant pool.
39357         MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
39358         SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
39359         MachinePointerInfo MPI =
39360             MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
39361         Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
39362         return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
39363                            MachineMemOperand::MOLoad);
39364       }
39365     }
39366 
39367     // Pull subvector inserts into undef through VZEXT_MOVL by making it an
39368     // insert into a zero vector. This helps get VZEXT_MOVL closer to
39369     // scalar_to_vectors where 256/512 are canonicalized to an insert and a
39370     // 128-bit scalar_to_vector. This reduces the number of isel patterns.
39371     if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
39372       SDValue V = peekThroughOneUseBitcasts(N0);
39373 
39374       if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
39375           isNullConstant(V.getOperand(2))) {
39376         SDValue In = V.getOperand(1);
39377         MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
39378                                      In.getValueSizeInBits() /
39379                                          VT.getScalarSizeInBits());
39380         In = DAG.getBitcast(SubVT, In);
39381         SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
39382         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
39383                            getZeroVector(VT, Subtarget, DAG, DL), Movl,
39384                            V.getOperand(2));
39385       }
39386     }
39387 
39388     return SDValue();
39389   }
39390   case X86ISD::BLENDI: {
39391     SDValue N0 = N.getOperand(0);
39392     SDValue N1 = N.getOperand(1);
39393 
39394     // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
39395     // TODO: Handle MVT::v16i16 repeated blend mask.
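    // e.g. (v4i64 blendi (bitcast (v8i32 X)), (bitcast (v8i32 Y)), 0b0011)
    //      --> (v4i64 bitcast (v8i32 blendi X, Y, 0b00001111))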
39396     if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
39397         N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
39398       MVT SrcVT = N0.getOperand(0).getSimpleValueType();
39399       if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
39400           SrcVT.getScalarSizeInBits() >= 32) {
39401         unsigned BlendMask = N.getConstantOperandVal(2);
39402         unsigned Size = VT.getVectorNumElements();
39403         unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
39404         BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
39405         return DAG.getBitcast(
39406             VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
39407                             N1.getOperand(0),
39408                             DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
39409       }
39410     }
39411     return SDValue();
39412   }
39413   case X86ISD::SHUFP: {
39414     // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
39415     // This is a more relaxed shuffle combiner that can ignore oneuse limits.
39416     // TODO: Support types other than v4f32.
39417     if (VT == MVT::v4f32) {
39418       bool Updated = false;
39419       SmallVector<int> Mask;
39420       SmallVector<SDValue> Ops;
39421       if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
39422           Ops.size() == 2) {
39423         for (int i = 0; i != 2; ++i) {
39424           SmallVector<SDValue> SubOps;
39425           SmallVector<int> SubMask, SubScaledMask;
39426           SDValue Sub = peekThroughBitcasts(Ops[i]);
39427           // TODO: Scaling might be easier if we specify the demanded elts.
39428           if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
39429               scaleShuffleElements(SubMask, 4, SubScaledMask) &&
39430               SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
39431             int Ofs = i * 2;
39432             Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
39433             Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
39434             Ops[i] = DAG.getBitcast(VT, SubOps[0]);
39435             Updated = true;
39436           }
39437         }
39438       }
39439       if (Updated) {
39440         for (int &M : Mask)
39441           M %= 4;
39442         Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
39443         return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
39444       }
39445     }
39446     return SDValue();
39447   }
39448   case X86ISD::VPERMI: {
39449     // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
39450     // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
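    // e.g. (v4i64 vpermi (bitcast (v4f64 X)), imm)
    //      --> (v4i64 bitcast (v4f64 vpermi X, imm))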
39451     SDValue N0 = N.getOperand(0);
39452     SDValue N1 = N.getOperand(1);
39453     unsigned EltSizeInBits = VT.getScalarSizeInBits();
39454     if (N0.getOpcode() == ISD::BITCAST &&
39455         N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
39456       SDValue Src = N0.getOperand(0);
39457       EVT SrcVT = Src.getValueType();
39458       SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
39459       return DAG.getBitcast(VT, Res);
39460     }
39461     return SDValue();
39462   }
39463   case X86ISD::VPERM2X128: {
39464     // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
39465     SDValue LHS = N->getOperand(0);
39466     SDValue RHS = N->getOperand(1);
39467     if (LHS.getOpcode() == ISD::BITCAST &&
39468         (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
39469       EVT SrcVT = LHS.getOperand(0).getValueType();
39470       if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
39471         return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
39472                                               DAG.getBitcast(SrcVT, LHS),
39473                                               DAG.getBitcast(SrcVT, RHS),
39474                                               N->getOperand(2)));
39475       }
39476     }
39477 
39478     // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
39479     if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
39480       return Res;
39481 
39482     // Fold vperm2x128 subvector shuffle with an inner concat pattern.
39483     // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
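    // Each imm nibble controls one 128-bit half of the result: values 0-3 pick
    // one of the four 128-bit source halves, and bit 3 of a nibble zeroes that
    // result half (rejected below via the Idx > 3 check).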
39484     auto FindSubVector128 = [&](unsigned Idx) {
39485       if (Idx > 3)
39486         return SDValue();
39487       SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
39488       SmallVector<SDValue> SubOps;
39489       if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
39490         return SubOps[Idx & 1];
39491       unsigned NumElts = Src.getValueType().getVectorNumElements();
39492       if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
39493           Src.getOperand(1).getValueSizeInBits() == 128 &&
39494           Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
39495         return Src.getOperand(1);
39496       }
39497       return SDValue();
39498     };
39499     unsigned Imm = N.getConstantOperandVal(2);
39500     if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
39501       if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
39502         MVT SubVT = VT.getHalfNumVectorElementsVT();
39503         SubLo = DAG.getBitcast(SubVT, SubLo);
39504         SubHi = DAG.getBitcast(SubVT, SubHi);
39505         return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
39506       }
39507     }
39508     return SDValue();
39509   }
39510   case X86ISD::PSHUFD:
39511   case X86ISD::PSHUFLW:
39512   case X86ISD::PSHUFHW:
39513     Mask = getPSHUFShuffleMask(N);
    assert(Mask.size() == 4 && "Unexpected PSHUF shuffle mask size");
39515     break;
39516   case X86ISD::MOVSD:
39517   case X86ISD::MOVSH:
39518   case X86ISD::MOVSS: {
39519     SDValue N0 = N.getOperand(0);
39520     SDValue N1 = N.getOperand(1);
39521 
39522     // Canonicalize scalar FPOps:
39523     // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
39524     // If commutable, allow OP(N1[0], N0[0]).
39525     unsigned Opcode1 = N1.getOpcode();
39526     if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
39527         Opcode1 == ISD::FDIV) {
39528       SDValue N10 = N1.getOperand(0);
39529       SDValue N11 = N1.getOperand(1);
39530       if (N10 == N0 ||
39531           (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
39532         if (N10 != N0)
39533           std::swap(N10, N11);
39534         MVT SVT = VT.getVectorElementType();
39535         SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
39536         N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
39537         N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
39538         SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
39539         SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
39540         return DAG.getNode(Opcode, DL, VT, N0, SclVec);
39541       }
39542     }
39543 
39544     return SDValue();
39545   }
39546   case X86ISD::INSERTPS: {
39547     assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
39548     SDValue Op0 = N.getOperand(0);
39549     SDValue Op1 = N.getOperand(1);
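    // The INSERTPS immediate encodes: bits [7:6] = source element index,
    // bits [5:4] = destination element index, bits [3:0] = zero mask.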
39550     unsigned InsertPSMask = N.getConstantOperandVal(2);
39551     unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
39552     unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
39553     unsigned ZeroMask = InsertPSMask & 0xF;
39554 
39555     // If we zero out all elements from Op0 then we don't need to reference it.
39556     if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
39557       return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
39558                          DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39559 
39560     // If we zero out the element from Op1 then we don't need to reference it.
39561     if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
39562       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
39563                          DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39564 
39565     // Attempt to merge insertps Op1 with an inner target shuffle node.
39566     SmallVector<int, 8> TargetMask1;
39567     SmallVector<SDValue, 2> Ops1;
39568     APInt KnownUndef1, KnownZero1;
39569     if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
39570                                      KnownZero1)) {
39571       if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
39572         // Zero/UNDEF insertion - zero out element and remove dependency.
39573         InsertPSMask |= (1u << DstIdx);
39574         return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
39575                            DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39576       }
39577       // Update insertps mask srcidx and reference the source input directly.
39578       int M = TargetMask1[SrcIdx];
39579       assert(0 <= M && M < 8 && "Shuffle index out of range");
39580       InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
39581       Op1 = Ops1[M < 4 ? 0 : 1];
39582       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
39583                          DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39584     }
39585 
39586     // Attempt to merge insertps Op0 with an inner target shuffle node.
39587     SmallVector<int, 8> TargetMask0;
39588     SmallVector<SDValue, 2> Ops0;
39589     APInt KnownUndef0, KnownZero0;
39590     if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
39591                                      KnownZero0)) {
39592       bool Updated = false;
39593       bool UseInput00 = false;
39594       bool UseInput01 = false;
39595       for (int i = 0; i != 4; ++i) {
39596         if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
39597           // No change if element is already zero or the inserted element.
39598           continue;
39599         } else if (KnownUndef0[i] || KnownZero0[i]) {
39600           // If the target mask is undef/zero then we must zero the element.
39601           InsertPSMask |= (1u << i);
39602           Updated = true;
39603           continue;
39604         }
39605 
        // The input vector element must stay in place - the shuffle must not
        // move it to a different lane.
39607         int M = TargetMask0[i];
39608         if (M != i && M != (i + 4))
39609           return SDValue();
39610 
39611         // Determine which inputs of the target shuffle we're using.
39612         UseInput00 |= (0 <= M && M < 4);
39613         UseInput01 |= (4 <= M);
39614       }
39615 
39616       // If we're not using both inputs of the target shuffle then use the
39617       // referenced input directly.
39618       if (UseInput00 && !UseInput01) {
39619         Updated = true;
39620         Op0 = Ops0[0];
39621       } else if (!UseInput00 && UseInput01) {
39622         Updated = true;
39623         Op0 = Ops0[1];
39624       }
39625 
39626       if (Updated)
39627         return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
39628                            DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
39629     }
39630 
39631     // If we're inserting an element from a vbroadcast load, fold the
39632     // load into the X86insertps instruction. We need to convert the scalar
39633     // load to a vector and clear the source lane of the INSERTPS control.
39634     if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
39635       auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
39636       if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
39637         SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
39638                                    MemIntr->getBasePtr(),
39639                                    MemIntr->getMemOperand());
39640         SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
39641                            DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
39642                                        Load),
39643                            DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
39644         DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
39645         return Insert;
39646       }
39647     }
39648 
39649     return SDValue();
39650   }
39651   default:
39652     return SDValue();
39653   }
39654 
39655   // Nuke no-op shuffles that show up after combining.
39656   if (isNoopShuffleMask(Mask))
39657     return N.getOperand(0);
39658 
39659   // Look for simplifications involving one or two shuffle instructions.
39660   SDValue V = N.getOperand(0);
39661   switch (N.getOpcode()) {
39662   default:
39663     break;
39664   case X86ISD::PSHUFLW:
39665   case X86ISD::PSHUFHW:
39666     assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
39667 
39668     // See if this reduces to a PSHUFD which is no more expensive and can
39669     // combine with more operations. Note that it has to at least flip the
39670     // dwords as otherwise it would have been removed as a no-op.
39671     if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
39672       int DMask[] = {0, 1, 2, 3};
39673       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
39674       DMask[DOffset + 0] = DOffset + 1;
39675       DMask[DOffset + 1] = DOffset + 0;
39676       MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
39677       V = DAG.getBitcast(DVT, V);
39678       V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
39679                       getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
39680       return DAG.getBitcast(VT, V);
39681     }
39682 
39683     // Look for shuffle patterns which can be implemented as a single unpack.
39684     // FIXME: This doesn't handle the location of the PSHUFD generically, and
39685     // only works when we have a PSHUFD followed by two half-shuffles.
39686     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
39687         (V.getOpcode() == X86ISD::PSHUFLW ||
39688          V.getOpcode() == X86ISD::PSHUFHW) &&
39689         V.getOpcode() != N.getOpcode() &&
39690         V.hasOneUse() && V.getOperand(0).hasOneUse()) {
39691       SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
39692       if (D.getOpcode() == X86ISD::PSHUFD) {
39693         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
39694         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
39695         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
39696         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
39697         int WordMask[8];
39698         for (int i = 0; i < 4; ++i) {
39699           WordMask[i + NOffset] = Mask[i] + NOffset;
39700           WordMask[i + VOffset] = VMask[i] + VOffset;
39701         }
39702         // Map the word mask through the DWord mask.
39703         int MappedMask[8];
39704         for (int i = 0; i < 8; ++i)
39705           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
39706         if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
39707             makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
39708           // We can replace all three shuffles with an unpack.
39709           V = DAG.getBitcast(VT, D.getOperand(0));
39710           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
39711                                                 : X86ISD::UNPCKH,
39712                              DL, VT, V, V);
39713         }
39714       }
39715     }
39716 
39717     break;
39718 
39719   case X86ISD::PSHUFD:
39720     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
39721       return NewN;
39722 
39723     break;
39724   }
39725 
39726   return SDValue();
39727 }
39728 
/// Checks if the shuffle mask takes its elements alternately from two vectors:
/// even positions come from one source and odd positions from the other, with
/// each position selecting the element of the same index.
39731 /// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
39732 static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
39733 
39734   int ParitySrc[2] = {-1, -1};
39735   unsigned Size = Mask.size();
39736   for (unsigned i = 0; i != Size; ++i) {
39737     int M = Mask[i];
39738     if (M < 0)
39739       continue;
39740 
39741     // Make sure we are using the matching element from the input.
39742     if ((M % Size) != i)
39743       return false;
39744 
39745     // Make sure we use the same input for all elements of the same parity.
39746     int Src = M / Size;
39747     if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
39748       return false;
39749     ParitySrc[i % 2] = Src;
39750   }
39751 
39752   // Make sure each input is used.
39753   if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
39754     return false;
39755 
39756   Op0Even = ParitySrc[0] == 0;
39757   return true;
39758 }
39759 
39760 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
39761 /// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
39762 /// are written to the parameters \p Opnd0 and \p Opnd1.
39763 ///
/// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
/// shuffle nodes so it is easier to generically match. We also insert dummy
/// vector shuffle nodes for the operands which explicitly discard the lanes
/// that are unused by this operation, so that the rest of the combiner can
/// make use of the fact that they are unused.
39769 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
39770                              SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
39771                              bool &IsSubAdd) {
39772 
39773   EVT VT = N->getValueType(0);
39774   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39775   if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
39776       !VT.getSimpleVT().isFloatingPoint())
39777     return false;
39778 
39779   // We only handle target-independent shuffles.
39780   // FIXME: It would be easy and harmless to use the target shuffle mask
39781   // extraction tool to support more.
39782   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
39783     return false;
39784 
39785   SDValue V1 = N->getOperand(0);
39786   SDValue V2 = N->getOperand(1);
39787 
39788   // Make sure we have an FADD and an FSUB.
39789   if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
39790       (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
39791       V1.getOpcode() == V2.getOpcode())
39792     return false;
39793 
39794   // If there are other uses of these operations we can't fold them.
39795   if (!V1->hasOneUse() || !V2->hasOneUse())
39796     return false;
39797 
39798   // Ensure that both operations have the same operands. Note that we can
39799   // commute the FADD operands.
39800   SDValue LHS, RHS;
39801   if (V1.getOpcode() == ISD::FSUB) {
39802     LHS = V1->getOperand(0); RHS = V1->getOperand(1);
39803     if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
39804         (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
39805       return false;
39806   } else {
39807     assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
39808     LHS = V2->getOperand(0); RHS = V2->getOperand(1);
39809     if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
39810         (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
39811       return false;
39812   }
39813 
39814   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
39815   bool Op0Even;
39816   if (!isAddSubOrSubAddMask(Mask, Op0Even))
39817     return false;
39818 
39819   // It's a subadd if the vector in the even parity is an FADD.
39820   IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
39821                      : V2->getOpcode() == ISD::FADD;
39822 
39823   Opnd0 = LHS;
39824   Opnd1 = RHS;
39825   return true;
39826 }
39827 
39828 /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
39829 static SDValue combineShuffleToFMAddSub(SDNode *N,
39830                                         const X86Subtarget &Subtarget,
39831                                         SelectionDAG &DAG) {
39832   // We only handle target-independent shuffles.
39833   // FIXME: It would be easy and harmless to use the target shuffle mask
39834   // extraction tool to support more.
39835   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
39836     return SDValue();
39837 
39838   MVT VT = N->getSimpleValueType(0);
39839   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39840   if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
39841     return SDValue();
39842 
  // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
39844   SDValue Op0 = N->getOperand(0);
39845   SDValue Op1 = N->getOperand(1);
39846   SDValue FMAdd = Op0, FMSub = Op1;
39847   if (FMSub.getOpcode() != X86ISD::FMSUB)
39848     std::swap(FMAdd, FMSub);
39849 
39850   if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
39851       FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
39852       FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
39853       FMAdd.getOperand(2) != FMSub.getOperand(2))
39854     return SDValue();
39855 
39856   // Check for correct shuffle mask.
39857   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
39858   bool Op0Even;
39859   if (!isAddSubOrSubAddMask(Mask, Op0Even))
39860     return SDValue();
39861 
39862   // FMAddSub takes zeroth operand from FMSub node.
39863   SDLoc DL(N);
39864   bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
39865   unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
39866   return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
39867                      FMAdd.getOperand(2));
39868 }
39869 
39870 /// Try to combine a shuffle into a target-specific add-sub or
39871 /// mul-add-sub node.
39872 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
39873                                                 const X86Subtarget &Subtarget,
39874                                                 SelectionDAG &DAG) {
39875   if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
39876     return V;
39877 
39878   SDValue Opnd0, Opnd1;
39879   bool IsSubAdd;
39880   if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
39881     return SDValue();
39882 
39883   MVT VT = N->getSimpleValueType(0);
39884   SDLoc DL(N);
39885 
39886   // Try to generate X86ISD::FMADDSUB node here.
39887   SDValue Opnd2;
39888   if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
39889     unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
39890     return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
39891   }
39892 
39893   if (IsSubAdd)
39894     return SDValue();
39895 
39896   // Do not generate X86ISD::ADDSUB node for 512-bit types even though
39897   // the ADDSUB idiom has been successfully recognized. There are no known
39898   // X86 targets with 512-bit ADDSUB instructions!
39899   if (VT.is512BitVector())
39900     return SDValue();
39901 
39902   // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
39903   // the ADDSUB idiom has been successfully recognized. There are no known
39904   // X86 targets with FP16 ADDSUB instructions!
39905   if (VT.getVectorElementType() == MVT::f16)
39906     return SDValue();
39907 
39908   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
39909 }
39910 
39911 // We are looking for a shuffle where both sources are concatenated with undef
39912 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
39913 // if we can express this as a single-source shuffle, that's preferable.
39914 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
39915                                            const X86Subtarget &Subtarget) {
39916   if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
39917     return SDValue();
39918 
39919   EVT VT = N->getValueType(0);
39920 
39921   // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
39922   if (!VT.is128BitVector() && !VT.is256BitVector())
39923     return SDValue();
39924 
39925   if (VT.getVectorElementType() != MVT::i32 &&
39926       VT.getVectorElementType() != MVT::i64 &&
39927       VT.getVectorElementType() != MVT::f32 &&
39928       VT.getVectorElementType() != MVT::f64)
39929     return SDValue();
39930 
39931   SDValue N0 = N->getOperand(0);
39932   SDValue N1 = N->getOperand(1);
39933 
39934   // Check that both sources are concats with undef.
39935   if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
39936       N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
39937       N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
39938       !N1.getOperand(1).isUndef())
39939     return SDValue();
39940 
39941   // Construct the new shuffle mask. Elements from the first source retain their
39942   // index, but elements from the second source no longer need to skip an undef.
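  // e.g. for v8i32, mask index 8 (the first element of the second source)
  // becomes index 4 in the new single-source shuffle.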
39943   SmallVector<int, 8> Mask;
39944   int NumElts = VT.getVectorNumElements();
39945 
39946   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
39947   for (int Elt : SVOp->getMask())
39948     Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
39949 
39950   SDLoc DL(N);
39951   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
39952                                N1.getOperand(0));
39953   return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
39954 }
39955 
39956 /// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
39957 /// low half of each source vector and does not set any high half elements in
39958 /// the destination vector, narrow the shuffle to half its original size.
39959 static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
39960   if (!Shuf->getValueType(0).isSimple())
39961     return SDValue();
39962   MVT VT = Shuf->getSimpleValueType(0);
39963   if (!VT.is256BitVector() && !VT.is512BitVector())
39964     return SDValue();
39965 
39966   // See if we can ignore all of the high elements of the shuffle.
39967   ArrayRef<int> Mask = Shuf->getMask();
39968   if (!isUndefUpperHalf(Mask))
39969     return SDValue();
39970 
39971   // Check if the shuffle mask accesses only the low half of each input vector
39972   // (half-index output is 0 or 2).
39973   int HalfIdx1, HalfIdx2;
39974   SmallVector<int, 8> HalfMask(Mask.size() / 2);
39975   if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
39976       (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
39977     return SDValue();
39978 
39979   // Create a half-width shuffle to replace the unnecessarily wide shuffle.
39980   // The trick is knowing that all of the insert/extract are actually free
39981   // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
39982   // of narrow inputs into a narrow output, and that is always cheaper than
39983   // the wide shuffle that we started with.
39984   return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
39985                                Shuf->getOperand(1), HalfMask, HalfIdx1,
39986                                HalfIdx2, false, DAG, /*UseConcat*/true);
39987 }
39988 
39989 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
39990                               TargetLowering::DAGCombinerInfo &DCI,
39991                               const X86Subtarget &Subtarget) {
39992   if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
39993     if (SDValue V = narrowShuffle(Shuf, DAG))
39994       return V;
39995 
39996   // If we have legalized the vector types, look for blends of FADD and FSUB
39997   // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
39998   SDLoc dl(N);
39999   EVT VT = N->getValueType(0);
40000   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40001   if (TLI.isTypeLegal(VT))
40002     if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
40003       return AddSub;
40004 
40005   // Attempt to combine into a vector load/broadcast.
40006   if (SDValue LD = combineToConsecutiveLoads(
40007           VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
40008     return LD;
40009 
40010   // For AVX2, we sometimes want to combine
40011   // (vector_shuffle <mask> (concat_vectors t1, undef)
40012   //                        (concat_vectors t2, undef))
40013   // Into:
40014   // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
  // Since the latter can be efficiently lowered with VPERMD/VPERMQ.
40016   if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
40017     return ShufConcat;
40018 
40019   if (isTargetShuffle(N->getOpcode())) {
40020     SDValue Op(N, 0);
40021     if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
40022       return Shuffle;
40023 
40024     // Try recursively combining arbitrary sequences of x86 shuffle
40025     // instructions into higher-order shuffles. We do this after combining
40026     // specific PSHUF instruction sequences into their minimal form so that we
40027     // can evaluate how many specialized shuffle instructions are involved in
40028     // a particular chain.
40029     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
40030       return Res;
40031 
40032     // Simplify source operands based on shuffle mask.
40033     // TODO - merge this into combineX86ShufflesRecursively.
40034     APInt KnownUndef, KnownZero;
40035     APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
40036     if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
40037                                        DCI))
40038       return SDValue(N, 0);
40039 
40040     // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
40041     // Perform this after other shuffle combines to allow inner shuffles to be
40042     // combined away first.
40043     if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, SDLoc(N)))
40044       return BinOp;
40045   }
40046 
40047   return SDValue();
40048 }
40049 
40050 // Simplify variable target shuffle masks based on the demanded elements.
40051 // TODO: Handle DemandedBits in mask indices as well?
40052 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
40053     SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
40054     TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
  // If we're demanding all elements, don't bother trying to simplify the mask.
40056   unsigned NumElts = DemandedElts.getBitWidth();
40057   if (DemandedElts.isAllOnes())
40058     return false;
40059 
40060   SDValue Mask = Op.getOperand(MaskIndex);
40061   if (!Mask.hasOneUse())
40062     return false;
40063 
40064   // Attempt to generically simplify the variable shuffle mask.
40065   APInt MaskUndef, MaskZero;
40066   if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
40067                                  Depth + 1))
40068     return true;
40069 
40070   // Attempt to extract+simplify a (constant pool load) shuffle mask.
40071   // TODO: Support other types from getTargetShuffleMaskIndices?
40072   SDValue BC = peekThroughOneUseBitcasts(Mask);
40073   EVT BCVT = BC.getValueType();
40074   auto *Load = dyn_cast<LoadSDNode>(BC);
40075   if (!Load)
40076     return false;
40077 
40078   const Constant *C = getTargetConstantFromNode(Load);
40079   if (!C)
40080     return false;
40081 
40082   Type *CTy = C->getType();
40083   if (!CTy->isVectorTy() ||
40084       CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
40085     return false;
40086 
40087   // Handle scaling for i64 elements on 32-bit targets.
40088   unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
40089   if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
40090     return false;
40091   unsigned Scale = NumCstElts / NumElts;
40092 
40093   // Simplify mask if we have an undemanded element that is not undef.
40094   bool Simplified = false;
40095   SmallVector<Constant *, 32> ConstVecOps;
40096   for (unsigned i = 0; i != NumCstElts; ++i) {
40097     Constant *Elt = C->getAggregateElement(i);
40098     if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
40099       ConstVecOps.push_back(UndefValue::get(Elt->getType()));
40100       Simplified = true;
40101       continue;
40102     }
40103     ConstVecOps.push_back(Elt);
40104   }
40105   if (!Simplified)
40106     return false;
40107 
40108   // Generate new constant pool entry + legalize immediately for the load.
40109   SDLoc DL(Op);
40110   SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
40111   SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
40112   SDValue NewMask = TLO.DAG.getLoad(
40113       BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
40114       MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
40115       Load->getAlign());
40116   return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
40117 }
40118 
40119 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
40120     SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
40121     TargetLoweringOpt &TLO, unsigned Depth) const {
40122   int NumElts = DemandedElts.getBitWidth();
40123   unsigned Opc = Op.getOpcode();
40124   EVT VT = Op.getValueType();
40125 
40126   // Handle special case opcodes.
40127   switch (Opc) {
40128   case X86ISD::PMULDQ:
40129   case X86ISD::PMULUDQ: {
40130     APInt LHSUndef, LHSZero;
40131     APInt RHSUndef, RHSZero;
40132     SDValue LHS = Op.getOperand(0);
40133     SDValue RHS = Op.getOperand(1);
40134     if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
40135                                    Depth + 1))
40136       return true;
40137     if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
40138                                    Depth + 1))
40139       return true;
    // Multiply by zero - if either operand element is known zero, the result
    // element is zero.
40141     KnownZero = LHSZero | RHSZero;
40142     break;
40143   }
40144   case X86ISD::VPMADDWD: {
40145     APInt LHSUndef, LHSZero;
40146     APInt RHSUndef, RHSZero;
40147     SDValue LHS = Op.getOperand(0);
40148     SDValue RHS = Op.getOperand(1);
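    // Each output element is computed from a pair of adjacent source elements,
    // so widen the demanded mask to the source element count.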
40149     APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
40150 
40151     if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
40152                                    Depth + 1))
40153       return true;
40154     if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
40155                                    Depth + 1))
40156       return true;
40157 
40158     // TODO: Multiply by zero.
40159 
    // If RHS/LHS elements are known zero then we don't need the LHS/RHS
    // equivalent.
40161     APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
40162     if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
40163                                    Depth + 1))
40164       return true;
40165     APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
40166     if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
40167                                    Depth + 1))
40168       return true;
40169     break;
40170   }
40171   case X86ISD::PSADBW: {
40172     SDValue LHS = Op.getOperand(0);
40173     SDValue RHS = Op.getOperand(1);
40174     assert(VT.getScalarType() == MVT::i64 &&
40175            LHS.getValueType() == RHS.getValueType() &&
40176            LHS.getValueType().getScalarType() == MVT::i8 &&
40177            "Unexpected PSADBW types");
40178 
40179     // Aggressively peek through ops to get at the demanded elts.
40180     if (!DemandedElts.isAllOnes()) {
40181       unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
40182       APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
40183       SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
40184           LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
40185       SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
40186           RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
40187       if (NewLHS || NewRHS) {
40188         NewLHS = NewLHS ? NewLHS : LHS;
40189         NewRHS = NewRHS ? NewRHS : RHS;
40190         return TLO.CombineTo(
40191             Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
40192       }
40193     }
40194     break;
40195   }
40196   case X86ISD::VSHL:
40197   case X86ISD::VSRL:
40198   case X86ISD::VSRA: {
40199     // We only need the bottom 64-bits of the (128-bit) shift amount.
40200     SDValue Amt = Op.getOperand(1);
40201     MVT AmtVT = Amt.getSimpleValueType();
40202     assert(AmtVT.is128BitVector() && "Unexpected value type");
40203 
    // If every use of this value is as an SSE shift amount then we know that
    // only the bottom 64-bits are ever used.
40206     bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
40207       unsigned UseOpc = Use->getOpcode();
40208       return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
40209               UseOpc == X86ISD::VSRA) &&
40210              Use->getOperand(0) != Amt;
40211     });
40212 
40213     APInt AmtUndef, AmtZero;
40214     unsigned NumAmtElts = AmtVT.getVectorNumElements();
40215     APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
40216     if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
40217                                    Depth + 1, AssumeSingleUse))
40218       return true;
40219     LLVM_FALLTHROUGH;
40220   }
40221   case X86ISD::VSHLI:
40222   case X86ISD::VSRLI:
40223   case X86ISD::VSRAI: {
40224     SDValue Src = Op.getOperand(0);
40225     APInt SrcUndef;
40226     if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
40227                                    Depth + 1))
40228       return true;
40229 
40230     // Aggressively peek through ops to get at the demanded elts.
40231     if (!DemandedElts.isAllOnes())
40232       if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
40233               Src, DemandedElts, TLO.DAG, Depth + 1))
40234         return TLO.CombineTo(
40235             Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
40236     break;
40237   }
40238   case X86ISD::VPSHA:
40239   case X86ISD::VPSHL:
40240   case X86ISD::VSHLV:
40241   case X86ISD::VSRLV:
40242   case X86ISD::VSRAV: {
40243     APInt LHSUndef, LHSZero;
40244     APInt RHSUndef, RHSZero;
40245     SDValue LHS = Op.getOperand(0);
40246     SDValue RHS = Op.getOperand(1);
40247     if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
40248                                    Depth + 1))
40249       return true;
40250     if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
40251                                    Depth + 1))
40252       return true;
40253     KnownZero = LHSZero;
40254     break;
40255   }
40256   case X86ISD::KSHIFTL: {
40257     SDValue Src = Op.getOperand(0);
40258     auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
40259     assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
40260     unsigned ShiftAmt = Amt->getZExtValue();
40261 
40262     if (ShiftAmt == 0)
40263       return TLO.CombineTo(Op, Src);
40264 
40265     // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
40266     // single shift.  We can do this if the bottom bits (which are shifted
40267     // out) are never demanded.
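    // e.g. kshiftl(kshiftr(X, 4), 2) --> kshiftr(X, 2) if the low 2 elements
    // are not demanded.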
40268     if (Src.getOpcode() == X86ISD::KSHIFTR) {
40269       if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
40270         unsigned C1 = Src.getConstantOperandVal(1);
40271         unsigned NewOpc = X86ISD::KSHIFTL;
40272         int Diff = ShiftAmt - C1;
40273         if (Diff < 0) {
40274           Diff = -Diff;
40275           NewOpc = X86ISD::KSHIFTR;
40276         }
40277 
40278         SDLoc dl(Op);
40279         SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
40280         return TLO.CombineTo(
40281             Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
40282       }
40283     }
40284 
40285     APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
40286     if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
40287                                    Depth + 1))
40288       return true;
40289 
40290     KnownUndef <<= ShiftAmt;
40291     KnownZero <<= ShiftAmt;
40292     KnownZero.setLowBits(ShiftAmt);
40293     break;
40294   }
40295   case X86ISD::KSHIFTR: {
40296     SDValue Src = Op.getOperand(0);
40297     auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
40298     assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
40299     unsigned ShiftAmt = Amt->getZExtValue();
40300 
40301     if (ShiftAmt == 0)
40302       return TLO.CombineTo(Op, Src);
40303 
40304     // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
40305     // single shift.  We can do this if the top bits (which are shifted
40306     // out) are never demanded.
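    // e.g. kshiftr(kshiftl(X, 4), 2) --> kshiftl(X, 2) if the high 2 elements
    // are not demanded.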
40307     if (Src.getOpcode() == X86ISD::KSHIFTL) {
40308       if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
40309         unsigned C1 = Src.getConstantOperandVal(1);
40310         unsigned NewOpc = X86ISD::KSHIFTR;
40311         int Diff = ShiftAmt - C1;
40312         if (Diff < 0) {
40313           Diff = -Diff;
40314           NewOpc = X86ISD::KSHIFTL;
40315         }
40316 
40317         SDLoc dl(Op);
40318         SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
40319         return TLO.CombineTo(
40320             Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
40321       }
40322     }
40323 
40324     APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
40325     if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
40326                                    Depth + 1))
40327       return true;
40328 
40329     KnownUndef.lshrInPlace(ShiftAmt);
40330     KnownZero.lshrInPlace(ShiftAmt);
40331     KnownZero.setHighBits(ShiftAmt);
40332     break;
40333   }
40334   case X86ISD::CVTSI2P:
40335   case X86ISD::CVTUI2P: {
40336     SDValue Src = Op.getOperand(0);
40337     MVT SrcVT = Src.getSimpleValueType();
40338     APInt SrcUndef, SrcZero;
40339     APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
40340     if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
40341                                    Depth + 1))
40342       return true;
40343     break;
40344   }
40345   case X86ISD::PACKSS:
40346   case X86ISD::PACKUS: {
40347     SDValue N0 = Op.getOperand(0);
40348     SDValue N1 = Op.getOperand(1);
40349 
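    // Each 128-bit result lane takes its low half from N0 and its high half
    // from N1, so split the demanded elements accordingly.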
40350     APInt DemandedLHS, DemandedRHS;
40351     getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
40352 
40353     APInt LHSUndef, LHSZero;
40354     if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
40355                                    Depth + 1))
40356       return true;
40357     APInt RHSUndef, RHSZero;
40358     if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
40359                                    Depth + 1))
40360       return true;
40361 
40362     // TODO - pass on known zero/undef.
40363 
40364     // Aggressively peek through ops to get at the demanded elts.
40365     // TODO - we should do this for all target/faux shuffles ops.
40366     if (!DemandedElts.isAllOnes()) {
40367       SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
40368                                                             TLO.DAG, Depth + 1);
40369       SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
40370                                                             TLO.DAG, Depth + 1);
40371       if (NewN0 || NewN1) {
40372         NewN0 = NewN0 ? NewN0 : N0;
40373         NewN1 = NewN1 ? NewN1 : N1;
40374         return TLO.CombineTo(Op,
40375                              TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
40376       }
40377     }
40378     break;
40379   }
40380   case X86ISD::HADD:
40381   case X86ISD::HSUB:
40382   case X86ISD::FHADD:
40383   case X86ISD::FHSUB: {
40384     SDValue N0 = Op.getOperand(0);
40385     SDValue N1 = Op.getOperand(1);
40386 
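    // Each horizontal result element reduces a pair of adjacent input
    // elements, e.g. (illustrative) for v4f32 = HADD(LHS, RHS), result elt 0
    // demands LHS elts 0-1 and result elt 2 demands RHS elts 0-1.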
40387     APInt DemandedLHS, DemandedRHS;
40388     getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
40389 
40390     APInt LHSUndef, LHSZero;
40391     if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
40392                                    Depth + 1))
40393       return true;
40394     APInt RHSUndef, RHSZero;
40395     if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
40396                                    Depth + 1))
40397       return true;
40398 
40399     // TODO - pass on known zero/undef.
40400 
40401     // Aggressively peek through ops to get at the demanded elts.
40402     // TODO: Handle repeated operands.
40403     if (N0 != N1 && !DemandedElts.isAllOnes()) {
40404       SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
40405                                                             TLO.DAG, Depth + 1);
40406       SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
40407                                                             TLO.DAG, Depth + 1);
40408       if (NewN0 || NewN1) {
40409         NewN0 = NewN0 ? NewN0 : N0;
40410         NewN1 = NewN1 ? NewN1 : N1;
40411         return TLO.CombineTo(Op,
40412                              TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
40413       }
40414     }
40415     break;
40416   }
40417   case X86ISD::VTRUNC:
40418   case X86ISD::VTRUNCS:
40419   case X86ISD::VTRUNCUS: {
40420     SDValue Src = Op.getOperand(0);
40421     MVT SrcVT = Src.getSimpleValueType();
40422     APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
40423     APInt SrcUndef, SrcZero;
40424     if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
40425                                    Depth + 1))
40426       return true;
40427     KnownZero = SrcZero.zextOrTrunc(NumElts);
40428     KnownUndef = SrcUndef.zextOrTrunc(NumElts);
40429     break;
40430   }
40431   case X86ISD::BLENDV: {
40432     APInt SelUndef, SelZero;
40433     if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
40434                                    SelZero, TLO, Depth + 1))
40435       return true;
40436 
40437     // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
40438     APInt LHSUndef, LHSZero;
40439     if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
40440                                    LHSZero, TLO, Depth + 1))
40441       return true;
40442 
40443     APInt RHSUndef, RHSZero;
40444     if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
40445                                    RHSZero, TLO, Depth + 1))
40446       return true;
40447 
40448     KnownZero = LHSZero & RHSZero;
40449     KnownUndef = LHSUndef & RHSUndef;
40450     break;
40451   }
40452   case X86ISD::VZEXT_MOVL: {
40453     // If upper demanded elements are already zero then we have nothing to do.
40454     SDValue Src = Op.getOperand(0);
40455     APInt DemandedUpperElts = DemandedElts;
40456     DemandedUpperElts.clearLowBits(1);
40457     if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
40458       return TLO.CombineTo(Op, Src);
40459     break;
40460   }
40461   case X86ISD::VBROADCAST: {
40462     SDValue Src = Op.getOperand(0);
40463     MVT SrcVT = Src.getSimpleValueType();
40464     if (!SrcVT.isVector())
40465       break;
40466     // Don't bother broadcasting if we just need the 0'th element.
40467     if (DemandedElts == 1) {
40468       if (Src.getValueType() != VT)
40469         Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
40470                              SDLoc(Op));
40471       return TLO.CombineTo(Op, Src);
40472     }
40473     APInt SrcUndef, SrcZero;
40474     APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
40475     if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
40476                                    Depth + 1))
40477       return true;
40478     // Aggressively peek through src to get at the demanded elt.
    // TODO - we should do this for all target/faux shuffle ops.
40480     if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
40481             Src, SrcElts, TLO.DAG, Depth + 1))
40482       return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
40483     break;
40484   }
40485   case X86ISD::VPERMV:
40486     if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
40487                                                    Depth))
40488       return true;
40489     break;
40490   case X86ISD::PSHUFB:
40491   case X86ISD::VPERMV3:
40492   case X86ISD::VPERMILPV:
40493     if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
40494                                                    Depth))
40495       return true;
40496     break;
40497   case X86ISD::VPPERM:
40498   case X86ISD::VPERMIL2:
40499     if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
40500                                                    Depth))
40501       return true;
40502     break;
40503   }
40504 
40505   // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
40506   // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
40507   // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
40508   if ((VT.is256BitVector() || VT.is512BitVector()) &&
40509       DemandedElts.lshr(NumElts / 2) == 0) {
40510     unsigned SizeInBits = VT.getSizeInBits();
40511     unsigned ExtSizeInBits = SizeInBits / 2;
40512 
40513     // See if 512-bit ops only use the bottom 128-bits.
40514     if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
40515       ExtSizeInBits = SizeInBits / 4;
40516 
40517     switch (Opc) {
40518       // Scalar broadcast.
40519     case X86ISD::VBROADCAST: {
40520       SDLoc DL(Op);
40521       SDValue Src = Op.getOperand(0);
40522       if (Src.getValueSizeInBits() > ExtSizeInBits)
40523         Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
40524       EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
40525                                     ExtSizeInBits / VT.getScalarSizeInBits());
40526       SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
40527       return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
40528                                                TLO.DAG, DL, ExtSizeInBits));
40529     }
40530     case X86ISD::VBROADCAST_LOAD: {
40531       SDLoc DL(Op);
40532       auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
40533       EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
40534                                     ExtSizeInBits / VT.getScalarSizeInBits());
40535       SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
40536       SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
40537       SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
40538           X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
40539           MemIntr->getMemOperand());
40540       TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
40541                                            Bcst.getValue(1));
40542       return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
40543                                                TLO.DAG, DL, ExtSizeInBits));
40544     }
40545       // Subvector broadcast.
40546     case X86ISD::SUBV_BROADCAST_LOAD: {
40547       auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
40548       EVT MemVT = MemIntr->getMemoryVT();
40549       if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
40550         SDLoc DL(Op);
40551         SDValue Ld =
40552             TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
40553                             MemIntr->getBasePtr(), MemIntr->getMemOperand());
40554         TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
40555                                              Ld.getValue(1));
40556         return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
40557                                                  TLO.DAG, DL, ExtSizeInBits));
40558       } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
40559         SDLoc DL(Op);
40560         EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
40561                                       ExtSizeInBits / VT.getScalarSizeInBits());
40562         if (SDValue BcstLd =
40563                 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
40564           return TLO.CombineTo(Op,
40565                                insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
40566                                                TLO.DAG, DL, ExtSizeInBits));
40567       }
40568       break;
40569     }
40570       // Byte shifts by immediate.
40571     case X86ISD::VSHLDQ:
40572     case X86ISD::VSRLDQ:
40573       // Shift by uniform.
40574     case X86ISD::VSHL:
40575     case X86ISD::VSRL:
40576     case X86ISD::VSRA:
40577       // Shift by immediate.
40578     case X86ISD::VSHLI:
40579     case X86ISD::VSRLI:
40580     case X86ISD::VSRAI: {
40581       SDLoc DL(Op);
40582       SDValue Ext0 =
40583           extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
40584       SDValue ExtOp =
40585           TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
40586       SDValue UndefVec = TLO.DAG.getUNDEF(VT);
40587       SDValue Insert =
40588           insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
40589       return TLO.CombineTo(Op, Insert);
40590     }
40591     case X86ISD::VPERMI: {
40592       // Simplify PERMPD/PERMQ to extract_subvector.
40593       // TODO: This should be done in shuffle combining.
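      // e.g. (illustrative) a PERMQ whose low two mask elements are <2,3>
      // only reads the upper 128-bit half of its source, so it becomes an
      // extract of that half inserted back at index 0.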
40594       if (VT == MVT::v4f64 || VT == MVT::v4i64) {
40595         SmallVector<int, 4> Mask;
40596         DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
40597         if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
40598           SDLoc DL(Op);
40599           SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
40600           SDValue UndefVec = TLO.DAG.getUNDEF(VT);
40601           SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
40602           return TLO.CombineTo(Op, Insert);
40603         }
40604       }
40605       break;
40606     }
40607     case X86ISD::VPERM2X128: {
40608       // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
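      // (Illustrative reading of the immediate's low nibble, which selects
      // the result's low 128 bits: bit 1 picks the source operand, bit 0
      // picks its low/high half, and bit 3 forces zero.)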
40609       SDLoc DL(Op);
40610       unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
40611       if (LoMask & 0x8)
40612         return TLO.CombineTo(
40613             Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
40614       unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
40615       unsigned SrcIdx = (LoMask & 0x2) >> 1;
40616       SDValue ExtOp =
40617           extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
40618       SDValue UndefVec = TLO.DAG.getUNDEF(VT);
40619       SDValue Insert =
40620           insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
40621       return TLO.CombineTo(Op, Insert);
40622     }
40623       // Zero upper elements.
40624     case X86ISD::VZEXT_MOVL:
40625       // Target unary shuffles by immediate:
40626     case X86ISD::PSHUFD:
40627     case X86ISD::PSHUFLW:
40628     case X86ISD::PSHUFHW:
40629     case X86ISD::VPERMILPI:
40630       // (Non-Lane Crossing) Target Shuffles.
40631     case X86ISD::VPERMILPV:
40632     case X86ISD::VPERMIL2:
40633     case X86ISD::PSHUFB:
40634     case X86ISD::UNPCKL:
40635     case X86ISD::UNPCKH:
40636     case X86ISD::BLENDI:
40637       // Integer ops.
40638     case X86ISD::AVG:
40639     case X86ISD::PACKSS:
40640     case X86ISD::PACKUS:
40641       // Horizontal Ops.
40642     case X86ISD::HADD:
40643     case X86ISD::HSUB:
40644     case X86ISD::FHADD:
40645     case X86ISD::FHSUB: {
40646       SDLoc DL(Op);
40647       SmallVector<SDValue, 4> Ops;
40648       for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
40649         SDValue SrcOp = Op.getOperand(i);
40650         EVT SrcVT = SrcOp.getValueType();
40651         assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
40652                "Unsupported vector size");
40653         Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
40654                                                           ExtSizeInBits)
40655                                        : SrcOp);
40656       }
40657       MVT ExtVT = VT.getSimpleVT();
40658       ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
40659                                ExtSizeInBits / ExtVT.getScalarSizeInBits());
40660       SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
40661       SDValue UndefVec = TLO.DAG.getUNDEF(VT);
40662       SDValue Insert =
40663           insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
40664       return TLO.CombineTo(Op, Insert);
40665     }
40666     }
40667   }
40668 
  // For broadcasts, unless we *only* demand the 0'th element,
  // stop attempts at simplification here - we aren't going to improve things;
  // this is better than any potential shuffle.
40672   if (isTargetShuffleSplat(Op) && !DemandedElts.isOne())
40673     return false;
40674 
40675   // Get target/faux shuffle mask.
40676   APInt OpUndef, OpZero;
40677   SmallVector<int, 64> OpMask;
40678   SmallVector<SDValue, 2> OpInputs;
40679   if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
40680                               OpZero, TLO.DAG, Depth, false))
40681     return false;
40682 
40683   // Shuffle inputs must be the same size as the result.
40684   if (OpMask.size() != (unsigned)NumElts ||
40685       llvm::any_of(OpInputs, [VT](SDValue V) {
40686         return VT.getSizeInBits() != V.getValueSizeInBits() ||
40687                !V.getValueType().isVector();
40688       }))
40689     return false;
40690 
40691   KnownZero = OpZero;
40692   KnownUndef = OpUndef;
40693 
40694   // Check if shuffle mask can be simplified to undef/zero/identity.
40695   int NumSrcs = OpInputs.size();
40696   for (int i = 0; i != NumElts; ++i)
40697     if (!DemandedElts[i])
40698       OpMask[i] = SM_SentinelUndef;
40699 
40700   if (isUndefInRange(OpMask, 0, NumElts)) {
40701     KnownUndef.setAllBits();
40702     return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
40703   }
40704   if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
40705     KnownZero.setAllBits();
40706     return TLO.CombineTo(
40707         Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
40708   }
40709   for (int Src = 0; Src != NumSrcs; ++Src)
40710     if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
40711       return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
40712 
40713   // Attempt to simplify inputs.
40714   for (int Src = 0; Src != NumSrcs; ++Src) {
40715     // TODO: Support inputs of different types.
40716     if (OpInputs[Src].getValueType() != VT)
40717       continue;
40718 
40719     int Lo = Src * NumElts;
40720     APInt SrcElts = APInt::getZero(NumElts);
40721     for (int i = 0; i != NumElts; ++i)
40722       if (DemandedElts[i]) {
40723         int M = OpMask[i] - Lo;
40724         if (0 <= M && M < NumElts)
40725           SrcElts.setBit(M);
40726       }
40727 
40728     // TODO - Propagate input undef/zero elts.
40729     APInt SrcUndef, SrcZero;
40730     if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
40731                                    TLO, Depth + 1))
40732       return true;
40733   }
40734 
40735   // If we don't demand all elements, then attempt to combine to a simpler
40736   // shuffle.
40737   // We need to convert the depth to something combineX86ShufflesRecursively
  // can handle - so pretend it's Depth == 0 again, and reduce the max depth
40739   // to match. This prevents combineX86ShuffleChain from returning a
40740   // combined shuffle that's the same as the original root, causing an
40741   // infinite loop.
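  // e.g. (illustrative) if only elements 0 and 2 of a 4-element shuffle are
  // demanded, we retry the combine with the mask <0,u,2,u>.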
40742   if (!DemandedElts.isAllOnes()) {
40743     assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
40744 
40745     SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
40746     for (int i = 0; i != NumElts; ++i)
40747       if (DemandedElts[i])
40748         DemandedMask[i] = i;
40749 
40750     SDValue NewShuffle = combineX86ShufflesRecursively(
40751         {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
40752         /*HasVarMask*/ false,
40753         /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
40754         Subtarget);
40755     if (NewShuffle)
40756       return TLO.CombineTo(Op, NewShuffle);
40757   }
40758 
40759   return false;
40760 }
40761 
40762 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
40763     SDValue Op, const APInt &OriginalDemandedBits,
40764     const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
40765     unsigned Depth) const {
40766   EVT VT = Op.getValueType();
40767   unsigned BitWidth = OriginalDemandedBits.getBitWidth();
40768   unsigned Opc = Op.getOpcode();
40769   switch(Opc) {
40770   case X86ISD::VTRUNC: {
40771     KnownBits KnownOp;
40772     SDValue Src = Op.getOperand(0);
40773     MVT SrcVT = Src.getSimpleValueType();
40774 
40775     // Simplify the input, using demanded bit information.
40776     APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
    APInt DemandedElts =
        OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
    if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO,
                             Depth + 1))
40779       return true;
40780     break;
40781   }
40782   case X86ISD::PMULDQ:
40783   case X86ISD::PMULUDQ: {
40784     // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
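    // e.g. (illustrative) v2i64 PMULUDQ(X, Y) computes
    // zext(X[i][31:0]) * zext(Y[i][31:0]) per element, so bits [63:32] of
    // each input element never affect the result.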
40785     KnownBits KnownOp;
40786     SDValue LHS = Op.getOperand(0);
40787     SDValue RHS = Op.getOperand(1);
40788     // FIXME: Can we bound this better?
40789     APInt DemandedMask = APInt::getLowBitsSet(64, 32);
40790     if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
40791                              TLO, Depth + 1))
40792       return true;
40793     if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
40794                              TLO, Depth + 1))
40795       return true;
40796 
40797     // Aggressively peek through ops to get at the demanded low bits.
40798     SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
40799         LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
40800     SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
40801         RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
40802     if (DemandedLHS || DemandedRHS) {
40803       DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
40804       DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
40805       return TLO.CombineTo(
40806           Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
40807     }
40808     break;
40809   }
40810   case X86ISD::VSHLI: {
40811     SDValue Op0 = Op.getOperand(0);
40812 
40813     unsigned ShAmt = Op.getConstantOperandVal(1);
40814     if (ShAmt >= BitWidth)
40815       break;
40816 
40817     APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
40818 
40819     // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
40820     // single shift.  We can do this if the bottom bits (which are shifted
40821     // out) are never demanded.
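    // e.g. (illustrative, with the low ShAmt result bits not demanded):
    //   ((X >>u 3) << 5) --> (X << 2)
    //   ((X >>u 5) << 3) --> (X >>u 2)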
40822     if (Op0.getOpcode() == X86ISD::VSRLI &&
40823         OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
40824       unsigned Shift2Amt = Op0.getConstantOperandVal(1);
40825       if (Shift2Amt < BitWidth) {
40826         int Diff = ShAmt - Shift2Amt;
40827         if (Diff == 0)
40828           return TLO.CombineTo(Op, Op0.getOperand(0));
40829 
40830         unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
40831         SDValue NewShift = TLO.DAG.getNode(
40832             NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
40833             TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
40834         return TLO.CombineTo(Op, NewShift);
40835       }
40836     }
40837 
    // If we are only demanding sign bits then we can use the shift source
    // directly.
40839     unsigned NumSignBits =
40840         TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
40841     unsigned UpperDemandedBits =
40842         BitWidth - OriginalDemandedBits.countTrailingZeros();
40843     if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
40844       return TLO.CombineTo(Op, Op0);
40845 
40846     if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
40847                              TLO, Depth + 1))
40848       return true;
40849 
40850     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
40851     Known.Zero <<= ShAmt;
40852     Known.One <<= ShAmt;
40853 
40854     // Low bits known zero.
40855     Known.Zero.setLowBits(ShAmt);
40856     return false;
40857   }
40858   case X86ISD::VSRLI: {
40859     unsigned ShAmt = Op.getConstantOperandVal(1);
40860     if (ShAmt >= BitWidth)
40861       break;
40862 
40863     APInt DemandedMask = OriginalDemandedBits << ShAmt;
40864 
40865     if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
40866                              OriginalDemandedElts, Known, TLO, Depth + 1))
40867       return true;
40868 
40869     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
40870     Known.Zero.lshrInPlace(ShAmt);
40871     Known.One.lshrInPlace(ShAmt);
40872 
40873     // High bits known zero.
40874     Known.Zero.setHighBits(ShAmt);
40875     return false;
40876   }
40877   case X86ISD::VSRAI: {
40878     SDValue Op0 = Op.getOperand(0);
40879     SDValue Op1 = Op.getOperand(1);
40880 
40881     unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
40882     if (ShAmt >= BitWidth)
40883       break;
40884 
40885     APInt DemandedMask = OriginalDemandedBits << ShAmt;
40886 
40887     // If we just want the sign bit then we don't need to shift it.
40888     if (OriginalDemandedBits.isSignMask())
40889       return TLO.CombineTo(Op, Op0);
40890 
40891     // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
40892     if (Op0.getOpcode() == X86ISD::VSHLI &&
40893         Op.getOperand(1) == Op0.getOperand(1)) {
40894       SDValue Op00 = Op0.getOperand(0);
40895       unsigned NumSignBits =
40896           TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
40897       if (ShAmt < NumSignBits)
40898         return TLO.CombineTo(Op, Op00);
40899     }
40900 
40901     // If any of the demanded bits are produced by the sign extension, we also
40902     // demand the input sign bit.
40903     if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
40904       DemandedMask.setSignBit();
40905 
40906     if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
40907                              TLO, Depth + 1))
40908       return true;
40909 
40910     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
40911     Known.Zero.lshrInPlace(ShAmt);
40912     Known.One.lshrInPlace(ShAmt);
40913 
40914     // If the input sign bit is known to be zero, or if none of the top bits
40915     // are demanded, turn this into an unsigned shift right.
40916     if (Known.Zero[BitWidth - ShAmt - 1] ||
40917         OriginalDemandedBits.countLeadingZeros() >= ShAmt)
40918       return TLO.CombineTo(
40919           Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
40920 
40921     // High bits are known one.
40922     if (Known.One[BitWidth - ShAmt - 1])
40923       Known.One.setHighBits(ShAmt);
40924     return false;
40925   }
40926   case X86ISD::BLENDV: {
40927     SDValue Sel = Op.getOperand(0);
40928     SDValue LHS = Op.getOperand(1);
40929     SDValue RHS = Op.getOperand(2);
40930 
40931     APInt SignMask = APInt::getSignMask(BitWidth);
40932     SDValue NewSel = SimplifyMultipleUseDemandedBits(
40933         Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
40934     SDValue NewLHS = SimplifyMultipleUseDemandedBits(
40935         LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
40936     SDValue NewRHS = SimplifyMultipleUseDemandedBits(
40937         RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
40938 
40939     if (NewSel || NewLHS || NewRHS) {
40940       NewSel = NewSel ? NewSel : Sel;
40941       NewLHS = NewLHS ? NewLHS : LHS;
40942       NewRHS = NewRHS ? NewRHS : RHS;
40943       return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
40944                                                NewSel, NewLHS, NewRHS));
40945     }
40946     break;
40947   }
40948   case X86ISD::PEXTRB:
40949   case X86ISD::PEXTRW: {
40950     SDValue Vec = Op.getOperand(0);
40951     auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
40952     MVT VecVT = Vec.getSimpleValueType();
40953     unsigned NumVecElts = VecVT.getVectorNumElements();
40954 
40955     if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
40956       unsigned Idx = CIdx->getZExtValue();
40957       unsigned VecBitWidth = VecVT.getScalarSizeInBits();
40958 
40959       // If we demand no bits from the vector then we must have demanded
      // bits from the implicit zext - simplify to zero.
40961       APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
40962       if (DemandedVecBits == 0)
40963         return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
40964 
40965       APInt KnownUndef, KnownZero;
40966       APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
40967       if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
40968                                      KnownZero, TLO, Depth + 1))
40969         return true;
40970 
40971       KnownBits KnownVec;
40972       if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
40973                                KnownVec, TLO, Depth + 1))
40974         return true;
40975 
40976       if (SDValue V = SimplifyMultipleUseDemandedBits(
40977               Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
40978         return TLO.CombineTo(
40979             Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
40980 
40981       Known = KnownVec.zext(BitWidth);
40982       return false;
40983     }
40984     break;
40985   }
40986   case X86ISD::PINSRB:
40987   case X86ISD::PINSRW: {
40988     SDValue Vec = Op.getOperand(0);
40989     SDValue Scl = Op.getOperand(1);
40990     auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
40991     MVT VecVT = Vec.getSimpleValueType();
40992 
40993     if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
40994       unsigned Idx = CIdx->getZExtValue();
40995       if (!OriginalDemandedElts[Idx])
40996         return TLO.CombineTo(Op, Vec);
40997 
40998       KnownBits KnownVec;
40999       APInt DemandedVecElts(OriginalDemandedElts);
41000       DemandedVecElts.clearBit(Idx);
41001       if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
41002                                KnownVec, TLO, Depth + 1))
41003         return true;
41004 
41005       KnownBits KnownScl;
41006       unsigned NumSclBits = Scl.getScalarValueSizeInBits();
41007       APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
41008       if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
41009         return true;
41010 
41011       KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
41012       Known = KnownBits::commonBits(KnownVec, KnownScl);
41013       return false;
41014     }
41015     break;
41016   }
41017   case X86ISD::PACKSS:
41018     // PACKSS saturates to MIN/MAX integer values. So if we just want the
    // sign bit then we can just ask for the source operands' sign bits.
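    // (Signed saturation clamps to the narrower type's MIN/MAX, which keeps
    // the sign of each source element - illustrative note.)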
41020     // TODO - add known bits handling.
41021     if (OriginalDemandedBits.isSignMask()) {
41022       APInt DemandedLHS, DemandedRHS;
41023       getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
41024 
41025       KnownBits KnownLHS, KnownRHS;
41026       APInt SignMask = APInt::getSignMask(BitWidth * 2);
41027       if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
41028                                KnownLHS, TLO, Depth + 1))
41029         return true;
41030       if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
41031                                KnownRHS, TLO, Depth + 1))
41032         return true;
41033 
41034       // Attempt to avoid multi-use ops if we don't need anything from them.
41035       SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
41036           Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
41037       SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
41038           Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
41039       if (DemandedOp0 || DemandedOp1) {
41040         SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
41041         SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
41042         return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
41043       }
41044     }
41045     // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
41046     break;
41047   case X86ISD::VBROADCAST: {
41048     SDValue Src = Op.getOperand(0);
41049     MVT SrcVT = Src.getSimpleValueType();
41050     APInt DemandedElts = APInt::getOneBitSet(
41051         SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
41052     if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
41053                              TLO, Depth + 1))
41054       return true;
41055     // If we don't need the upper bits, attempt to narrow the broadcast source.
41056     // Don't attempt this on AVX512 as it might affect broadcast folding.
41057     // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
41058     if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
41059         OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2) &&
41060         Src->hasOneUse()) {
41061       MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
41062       SDValue NewSrc =
41063           TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
41064       MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
41065       SDValue NewBcst =
41066           TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
41067       return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
41068     }
41069     break;
41070   }
41071   case X86ISD::PCMPGT:
41072     // icmp sgt(0, R) == ashr(R, BitWidth-1).
41073     // iff we only need the sign bit then we can use R directly.
41074     if (OriginalDemandedBits.isSignMask() &&
41075         ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
41076       return TLO.CombineTo(Op, Op.getOperand(1));
41077     break;
41078   case X86ISD::MOVMSK: {
41079     SDValue Src = Op.getOperand(0);
41080     MVT SrcVT = Src.getSimpleValueType();
41081     unsigned SrcBits = SrcVT.getScalarSizeInBits();
41082     unsigned NumElts = SrcVT.getVectorNumElements();
41083 
41084     // If we don't need the sign bits at all just return zero.
41085     if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
41086       return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41087 
41088     // See if we only demand bits from the lower 128-bit vector.
41089     if (SrcVT.is256BitVector() &&
41090         OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
41091       SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
41092       return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41093     }
41094 
41095     // Only demand the vector elements of the sign bits we need.
41096     APInt KnownUndef, KnownZero;
41097     APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
41098     if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
41099                                    TLO, Depth + 1))
41100       return true;
41101 
41102     Known.Zero = KnownZero.zextOrSelf(BitWidth);
41103     Known.Zero.setHighBits(BitWidth - NumElts);
41104 
41105     // MOVMSK only uses the MSB from each vector element.
41106     KnownBits KnownSrc;
41107     APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
41108     if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
41109                              Depth + 1))
41110       return true;
41111 
41112     if (KnownSrc.One[SrcBits - 1])
41113       Known.One.setLowBits(NumElts);
41114     else if (KnownSrc.Zero[SrcBits - 1])
41115       Known.Zero.setLowBits(NumElts);
41116 
    // Attempt to avoid multi-use ops if we don't need anything from them.
41118     if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
41119             Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
41120       return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41121     return false;
41122   }
41123   case X86ISD::BEXTR:
41124   case X86ISD::BEXTRI: {
41125     SDValue Op0 = Op.getOperand(0);
41126     SDValue Op1 = Op.getOperand(1);
41127 
41128     // Only bottom 16-bits of the control bits are required.
41129     if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
41130       // NOTE: SimplifyDemandedBits won't do this for constants.
41131       uint64_t Val1 = Cst1->getZExtValue();
41132       uint64_t MaskedVal1 = Val1 & 0xFFFF;
41133       if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
41134         SDLoc DL(Op);
41135         return TLO.CombineTo(
41136             Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
41137                                 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
41138       }
41139 
41140       unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
41141       unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
41142 
41143       // If the length is 0, the result is 0.
41144       if (Length == 0) {
41145         Known.setAllZero();
41146         return false;
41147       }
41148 
41149       if ((Shift + Length) <= BitWidth) {
41150         APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
41151         if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
41152           return true;
41153 
41154         Known = Known.extractBits(Length, Shift);
41155         Known = Known.zextOrTrunc(BitWidth);
41156         return false;
41157       }
41158     } else {
41159       assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
41160       KnownBits Known1;
41161       APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
41162       if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
41163         return true;
41164 
41165       // If the length is 0, replace with 0.
41166       KnownBits LengthBits = Known1.extractBits(8, 8);
41167       if (LengthBits.isZero())
41168         return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41169     }
41170 
41171     break;
41172   }
41173   case X86ISD::PDEP: {
41174     SDValue Op0 = Op.getOperand(0);
41175     SDValue Op1 = Op.getOperand(1);
41176 
41177     unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
41178     APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
41179 
    // If the demanded bits have leading zeroes, we don't demand those from the
41181     // mask.
41182     if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
41183       return true;
41184 
41185     // The number of possible 1s in the mask determines the number of LSBs of
41186     // operand 0 used. Undemanded bits from the mask don't matter so filter
41187     // them before counting.
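    // e.g. (illustrative) with mask 0b11110000 only the low 4 bits of the
    // source are deposited (at bit 4 and above), so only those are demanded.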
41188     KnownBits Known2;
41189     uint64_t Count = (~Known.Zero & LoMask).countPopulation();
41190     APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
41191     if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
41192       return true;
41193 
41194     // Zeroes are retained from the mask, but not ones.
41195     Known.One.clearAllBits();
41196     // The result will have at least as many trailing zeros as the non-mask
41197     // operand since bits can only map to the same or higher bit position.
41198     Known.Zero.setLowBits(Known2.countMinTrailingZeros());
41199     return false;
41200   }
41201   }
41202 
41203   return TargetLowering::SimplifyDemandedBitsForTargetNode(
41204       Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
41205 }
41206 
41207 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
41208     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
41209     SelectionDAG &DAG, unsigned Depth) const {
41210   int NumElts = DemandedElts.getBitWidth();
41211   unsigned Opc = Op.getOpcode();
41212   EVT VT = Op.getValueType();
41213 
41214   switch (Opc) {
41215   case X86ISD::PINSRB:
41216   case X86ISD::PINSRW: {
41217     // If we don't demand the inserted element, return the base vector.
41218     SDValue Vec = Op.getOperand(0);
41219     auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
41220     MVT VecVT = Vec.getSimpleValueType();
41221     if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
41222         !DemandedElts[CIdx->getZExtValue()])
41223       return Vec;
41224     break;
41225   }
41226   case X86ISD::VSHLI: {
41227     // If we are only demanding sign bits then we can use the shift source
41228     // directly.
41229     SDValue Op0 = Op.getOperand(0);
41230     unsigned ShAmt = Op.getConstantOperandVal(1);
41231     unsigned BitWidth = DemandedBits.getBitWidth();
41232     unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
41233     unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
41234     if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
41235       return Op0;
41236     break;
41237   }
41238   case X86ISD::VSRAI:
41239     // iff we only need the sign bit then we can use the source directly.
41240     // TODO: generalize where we only demand extended signbits.
41241     if (DemandedBits.isSignMask())
41242       return Op.getOperand(0);
41243     break;
41244   case X86ISD::PCMPGT:
41245     // icmp sgt(0, R) == ashr(R, BitWidth-1).
41246     // iff we only need the sign bit then we can use R directly.
41247     if (DemandedBits.isSignMask() &&
41248         ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
41249       return Op.getOperand(1);
41250     break;
41251   }
41252 
41253   APInt ShuffleUndef, ShuffleZero;
41254   SmallVector<int, 16> ShuffleMask;
41255   SmallVector<SDValue, 2> ShuffleOps;
41256   if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
41257                              ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
41258     // If all the demanded elts are from one operand and are inline,
41259     // then we can use the operand directly.
41260     int NumOps = ShuffleOps.size();
41261     if (ShuffleMask.size() == (unsigned)NumElts &&
41262         llvm::all_of(ShuffleOps, [VT](SDValue V) {
41263           return VT.getSizeInBits() == V.getValueSizeInBits();
41264         })) {
41265 
41266       if (DemandedElts.isSubsetOf(ShuffleUndef))
41267         return DAG.getUNDEF(VT);
41268       if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
41269         return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
41270 
41271       // Bitmask that indicates which ops have only been accessed 'inline'.
41272       APInt IdentityOp = APInt::getAllOnes(NumOps);
41273       for (int i = 0; i != NumElts; ++i) {
41274         int M = ShuffleMask[i];
41275         if (!DemandedElts[i] || ShuffleUndef[i])
41276           continue;
41277         int OpIdx = M / NumElts;
41278         int EltIdx = M % NumElts;
41279         if (M < 0 || EltIdx != i) {
41280           IdentityOp.clearAllBits();
41281           break;
41282         }
41283         IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
41284         if (IdentityOp == 0)
41285           break;
41286       }
41287       assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
41288              "Multiple identity shuffles detected");
41289 
41290       if (IdentityOp != 0)
41291         return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
41292     }
41293   }
41294 
41295   return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
41296       Op, DemandedBits, DemandedElts, DAG, Depth);
41297 }
41298 
41299 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
41300                                                   const APInt &DemandedElts,
41301                                                   APInt &UndefElts,
41302                                                   unsigned Depth) const {
41303   unsigned NumElts = DemandedElts.getBitWidth();
41304   unsigned Opc = Op.getOpcode();
41305 
41306   switch (Opc) {
41307   case X86ISD::VBROADCAST:
41308   case X86ISD::VBROADCAST_LOAD:
41309     // TODO: Permit vXi64 types on 32-bit targets.
41310     if (isTypeLegal(Op.getValueType().getVectorElementType())) {
      UndefElts = APInt::getZero(NumElts);
41312       return true;
41313     }
41314     return false;
41315   }
41316 
41317   return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
41318                                                    Depth);
41319 }
41320 
41321 // Helper to peek through bitops/trunc/setcc to determine size of source vector.
41322 // Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
41323 static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
41324                                       bool AllowTruncate) {
41325   switch (Src.getOpcode()) {
41326   case ISD::TRUNCATE:
41327     if (!AllowTruncate)
41328       return false;
41329     LLVM_FALLTHROUGH;
41330   case ISD::SETCC:
41331     return Src.getOperand(0).getValueSizeInBits() == Size;
41332   case ISD::AND:
41333   case ISD::XOR:
41334   case ISD::OR:
41335     return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
41336            checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
41337   }
41338   return false;
41339 }
41340 
41341 // Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
41342 static unsigned getAltBitOpcode(unsigned Opcode) {
41343   switch(Opcode) {
41344   case ISD::AND: return X86ISD::FAND;
41345   case ISD::OR: return X86ISD::FOR;
41346   case ISD::XOR: return X86ISD::FXOR;
41347   case X86ISD::ANDNP: return X86ISD::FANDN;
41348   }
41349   llvm_unreachable("Unknown bitwise opcode");
41350 }
41351 
41352 // Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
41353 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
41354                                           const SDLoc &DL) {
41355   EVT SrcVT = Src.getValueType();
41356   if (SrcVT != MVT::v4i1)
41357     return SDValue();
41358 
41359   switch (Src.getOpcode()) {
41360   case ISD::SETCC:
41361     if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
41362         ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
41363         cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
41364       SDValue Op0 = Src.getOperand(0);
41365       if (ISD::isNormalLoad(Op0.getNode()))
41366         return DAG.getBitcast(MVT::v4f32, Op0);
41367       if (Op0.getOpcode() == ISD::BITCAST &&
41368           Op0.getOperand(0).getValueType() == MVT::v4f32)
41369         return Op0.getOperand(0);
41370     }
41371     break;
41372   case ISD::AND:
41373   case ISD::XOR:
41374   case ISD::OR: {
41375     SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
41376     SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
41377     if (Op0 && Op1)
41378       return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
41379                          Op1);
41380     break;
41381   }
41382   }
41383   return SDValue();
41384 }
41385 
41386 // Helper to push sign extension of vXi1 SETCC result through bitops.
41387 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
41388                                           SDValue Src, const SDLoc &DL) {
41389   switch (Src.getOpcode()) {
41390   case ISD::SETCC:
41391   case ISD::TRUNCATE:
41392     return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
41393   case ISD::AND:
41394   case ISD::XOR:
41395   case ISD::OR:
41396     return DAG.getNode(
41397         Src.getOpcode(), DL, SExtVT,
41398         signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
41399         signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
41400   }
41401   llvm_unreachable("Unexpected node type for vXi1 sign extension");
41402 }
41403 
41404 // Try to match patterns such as
41405 // (i16 bitcast (v16i1 x))
41406 // ->
// (i16 movmsk (v16i8 sext (v16i1 x)))
41408 // before the illegal vector is scalarized on subtargets that don't have legal
41409 // vxi1 types.
41410 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
41411                                   const SDLoc &DL,
41412                                   const X86Subtarget &Subtarget) {
41413   EVT SrcVT = Src.getValueType();
41414   if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
41415     return SDValue();
41416 
41417   // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
41418   // legalization destroys the v4i32 type.
41419   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
41420     if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
41421       V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
41422                       DAG.getBitcast(MVT::v4f32, V));
41423       return DAG.getZExtOrTrunc(V, DL, VT);
41424     }
41425   }
41426 
41427   // If the input is a truncate from v16i8 or v32i8 go ahead and use a
41428   // movmskb even with avx512. This will be better than truncating to vXi1 and
41429   // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
41430   // vpcmpeqb/vpcmpgtb.
41431   bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
41432                       (Src.getOperand(0).getValueType() == MVT::v16i8 ||
41433                        Src.getOperand(0).getValueType() == MVT::v32i8 ||
41434                        Src.getOperand(0).getValueType() == MVT::v64i8);
41435 
41436   // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
41437   // directly with vpmovmskb/vmovmskps/vmovmskpd.
41438   if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
41439       cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
41440       ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
41441     EVT CmpVT = Src.getOperand(0).getValueType();
41442     EVT EltVT = CmpVT.getVectorElementType();
41443     if (CmpVT.getSizeInBits() <= 256 &&
41444         (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
41445       PreferMovMsk = true;
41446   }
41447 
41448   // With AVX512 vxi1 types are legal and we prefer using k-regs.
41449   // MOVMSK is supported in SSE2 or later.
41450   if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
41451     return SDValue();
41452 
41453   // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
41454   // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
41455   // v8i16 and v16i16.
41456   // For these two cases, we can shuffle the upper element bytes to a
41457   // consecutive sequence at the start of the vector and treat the results as
41458   // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
41459   // for v16i16 this is not the case, because the shuffle is expensive, so we
41460   // avoid sign-extending to this type entirely.
41461   // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
41462   // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
41463   MVT SExtVT;
41464   bool PropagateSExt = false;
41465   switch (SrcVT.getSimpleVT().SimpleTy) {
41466   default:
41467     return SDValue();
41468   case MVT::v2i1:
41469     SExtVT = MVT::v2i64;
41470     break;
41471   case MVT::v4i1:
41472     SExtVT = MVT::v4i32;
41473     // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
41474     // sign-extend to a 256-bit operation to avoid truncation.
41475     if (Subtarget.hasAVX() &&
41476         checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
41477       SExtVT = MVT::v4i64;
41478       PropagateSExt = true;
41479     }
41480     break;
41481   case MVT::v8i1:
41482     SExtVT = MVT::v8i16;
41483     // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
41484     // sign-extend to a 256-bit operation to match the compare.
41485     // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
41486     // 256-bit because the shuffle is cheaper than sign extending the result of
41487     // the compare.
41488     if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
41489                                checkBitcastSrcVectorSize(Src, 512, true))) {
41490       SExtVT = MVT::v8i32;
41491       PropagateSExt = true;
41492     }
41493     break;
41494   case MVT::v16i1:
41495     SExtVT = MVT::v16i8;
41496     // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
41497     // it is not profitable to sign-extend to 256-bit because this will
41498     // require an extra cross-lane shuffle which is more expensive than
41499     // truncating the result of the compare to 128-bits.
41500     break;
41501   case MVT::v32i1:
41502     SExtVT = MVT::v32i8;
41503     break;
41504   case MVT::v64i1:
    // If we have AVX512F but not AVX512BW, and the input is a truncation from
    // v64i8 (checked earlier), then split the input and make two pmovmskbs.
41507     if (Subtarget.hasAVX512()) {
41508       if (Subtarget.hasBWI())
41509         return SDValue();
41510       SExtVT = MVT::v64i8;
41511       break;
41512     }
41513     // Split if this is a <64 x i8> comparison result.
41514     if (checkBitcastSrcVectorSize(Src, 512, false)) {
41515       SExtVT = MVT::v64i8;
41516       break;
41517     }
41518     return SDValue();
41519   };
41520 
41521   SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
41522                             : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
41523 
41524   if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
41525     V = getPMOVMSKB(DL, V, DAG, Subtarget);
41526   } else {
41527     if (SExtVT == MVT::v8i16)
41528       V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
41529                       DAG.getUNDEF(MVT::v8i16));
41530     V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
41531   }
41532 
41533   EVT IntVT =
41534       EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
41535   V = DAG.getZExtOrTrunc(V, DL, IntVT);
41536   return DAG.getBitcast(VT, V);
41537 }
41538 
41539 // Convert a vXi1 constant build vector to the same width scalar integer.
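// e.g. (illustrative) (v4i1 <1,0,1,1>) --> (i4 0b1101), with element 0 in the
// lowest bit.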
41540 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
41541   EVT SrcVT = Op.getValueType();
41542   assert(SrcVT.getVectorElementType() == MVT::i1 &&
41543          "Expected a vXi1 vector");
41544   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
41545          "Expected a constant build vector");
41546 
41547   APInt Imm(SrcVT.getVectorNumElements(), 0);
41548   for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
41549     SDValue In = Op.getOperand(Idx);
41550     if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
41551       Imm.setBit(Idx);
41552   }
41553   EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
41554   return DAG.getConstant(Imm, SDLoc(Op), IntVT);
41555 }
41556 
41557 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
41558                                            TargetLowering::DAGCombinerInfo &DCI,
41559                                            const X86Subtarget &Subtarget) {
41560   assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
41561 
41562   if (!DCI.isBeforeLegalizeOps())
41563     return SDValue();
41564 
41565   // Only do this if we have k-registers.
41566   if (!Subtarget.hasAVX512())
41567     return SDValue();
41568 
41569   EVT DstVT = N->getValueType(0);
41570   SDValue Op = N->getOperand(0);
41571   EVT SrcVT = Op.getValueType();
41572 
41573   if (!Op.hasOneUse())
41574     return SDValue();
41575 
41576   // Look for logic ops.
41577   if (Op.getOpcode() != ISD::AND &&
41578       Op.getOpcode() != ISD::OR &&
41579       Op.getOpcode() != ISD::XOR)
41580     return SDValue();
41581 
41582   // Make sure we have a bitcast between mask registers and a scalar type.
41583   if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
41584         DstVT.isScalarInteger()) &&
41585       !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
41586         SrcVT.isScalarInteger()))
41587     return SDValue();
41588 
41589   SDValue LHS = Op.getOperand(0);
41590   SDValue RHS = Op.getOperand(1);
41591 
41592   if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
41593       LHS.getOperand(0).getValueType() == DstVT)
41594     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
41595                        DAG.getBitcast(DstVT, RHS));
41596 
41597   if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
41598       RHS.getOperand(0).getValueType() == DstVT)
41599     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
41600                        DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
41601 
41602   // If the RHS is a vXi1 build vector, this is a good reason to flip too.
41603   // Most of these have to move a constant from the scalar domain anyway.
41604   if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
41605     RHS = combinevXi1ConstantToInteger(RHS, DAG);
41606     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
41607                        DAG.getBitcast(DstVT, LHS), RHS);
41608   }
41609 
41610   return SDValue();
41611 }
41612 
41613 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
41614                                     const X86Subtarget &Subtarget) {
41615   SDLoc DL(BV);
41616   unsigned NumElts = BV->getNumOperands();
41617   SDValue Splat = BV->getSplatValue();
41618 
41619   // Build MMX element from integer GPR or SSE float values.
41620   auto CreateMMXElement = [&](SDValue V) {
41621     if (V.isUndef())
41622       return DAG.getUNDEF(MVT::x86mmx);
41623     if (V.getValueType().isFloatingPoint()) {
41624       if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
41625         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
41626         V = DAG.getBitcast(MVT::v2i64, V);
41627         return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
41628       }
41629       V = DAG.getBitcast(MVT::i32, V);
41630     } else {
41631       V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
41632     }
41633     return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
41634   };
41635 
41636   // Convert build vector ops to MMX data in the bottom elements.
41637   SmallVector<SDValue, 8> Ops;
41638 
41639   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41640 
41641   // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
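  // e.g. (illustrative) a v8i8 splat is first punpcklbw'd with itself to fill
  // the low 16 bits, then PSHUFW mask 0x00 repeats that word everywhere; a
  // v2i32 splat uses PSHUFW mask 0x44 to repeat the low two words.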
41642   if (Splat) {
41643     if (Splat.isUndef())
41644       return DAG.getUNDEF(MVT::x86mmx);
41645 
41646     Splat = CreateMMXElement(Splat);
41647 
41648     if (Subtarget.hasSSE1()) {
41649       // Unpack v8i8 to splat i8 elements to lowest 16-bits.
41650       if (NumElts == 8)
41651         Splat = DAG.getNode(
41652             ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
41653             DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
41654                                   TLI.getPointerTy(DAG.getDataLayout())),
41655             Splat, Splat);
41656 
41657       // Use PSHUFW to repeat 16-bit elements.
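      // A shuffle immediate of 0 broadcasts word 0 into all four 16-bit lanes
      // (for v8i8 the PUNPCKLBW above already duplicated the byte into word 0);
      // 0x44 (0b01000100) repeats words 0 and 1, i.e. the low 32-bit element,
      // which is what the 2-element v2i32/v2f32 case needs.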
41658       unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
41659       return DAG.getNode(
41660           ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
41661           DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
41662                                 TLI.getPointerTy(DAG.getDataLayout())),
41663           Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
41664     }
41665     Ops.append(NumElts, Splat);
41666   } else {
41667     for (unsigned i = 0; i != NumElts; ++i)
41668       Ops.push_back(CreateMMXElement(BV->getOperand(i)));
41669   }
41670 
41671   // Use a tree of PUNPCKLs to build up the general MMX vector.
41672   while (Ops.size() > 1) {
41673     unsigned NumOps = Ops.size();
41674     unsigned IntrinOp =
41675         (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
41676                      : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
41677                                     : Intrinsic::x86_mmx_punpcklbw));
41678     SDValue Intrin = DAG.getTargetConstant(
41679         IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
41680     for (unsigned i = 0; i != NumOps; i += 2)
41681       Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
41682                                Ops[i], Ops[i + 1]);
41683     Ops.resize(NumOps / 2);
41684   }
41685 
41686   return Ops[0];
41687 }
41688 
41689 // Recursive function that attempts to determine whether a bool vector node was
41690 // originally a vector/float/double that got truncated/extended/bitcast to/from
41691 // a scalar integer. If so, replace the scalar ops with bool vector equivalents
41692 // back down the chain.
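// For example (illustrative only), a pattern such as:
//   (v32i1 (bitcast (i32 (zero_extend (i16 (bitcast (v16i1 X)))))))
// can be rebuilt in the mask domain as:
//   (v32i1 (insert_subvector zeroinitializer, (v16i1 X), 0))
// avoiding the k-register <-> GPR round trip.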
41693 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
41694                                           SelectionDAG &DAG,
41695                                           const X86Subtarget &Subtarget) {
41696   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41697   unsigned Opc = V.getOpcode();
41698   switch (Opc) {
41699   case ISD::BITCAST: {
41700     // Bitcast from a vector/float/double; we can cheaply bitcast to VT.
41701     SDValue Src = V.getOperand(0);
41702     EVT SrcVT = Src.getValueType();
41703     if (SrcVT.isVector() || SrcVT.isFloatingPoint())
41704       return DAG.getBitcast(VT, Src);
41705     break;
41706   }
41707   case ISD::TRUNCATE: {
41708     // If we find a suitable source, a truncated scalar becomes a subvector.
41709     SDValue Src = V.getOperand(0);
41710     EVT NewSrcVT =
41711         EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
41712     if (TLI.isTypeLegal(NewSrcVT))
41713       if (SDValue N0 =
41714               combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
41715         return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
41716                            DAG.getIntPtrConstant(0, DL));
41717     break;
41718   }
41719   case ISD::ANY_EXTEND:
41720   case ISD::ZERO_EXTEND: {
41721     // If we find a suitable source, an extended scalar becomes a subvector.
41722     SDValue Src = V.getOperand(0);
41723     EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
41724                                     Src.getScalarValueSizeInBits());
41725     if (TLI.isTypeLegal(NewSrcVT))
41726       if (SDValue N0 =
41727               combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
41728         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
41729                            Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
41730                                                   : DAG.getConstant(0, DL, VT),
41731                            N0, DAG.getIntPtrConstant(0, DL));
41732     break;
41733   }
41734   case ISD::OR: {
41735     // If we find suitable sources, we can just move an OR to the vector domain.
41736     SDValue Src0 = V.getOperand(0);
41737     SDValue Src1 = V.getOperand(1);
41738     if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
41739       if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
41740         return DAG.getNode(Opc, DL, VT, N0, N1);
41741     break;
41742   }
41743   case ISD::SHL: {
41744     // If we find a suitable source, a SHL becomes a KSHIFTL.
41745     SDValue Src0 = V.getOperand(0);
41746     if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
41747         ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
41748       break;
41749 
41750     if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
41751       if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
41752         return DAG.getNode(
41753             X86ISD::KSHIFTL, DL, VT, N0,
41754             DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
41755     break;
41756   }
41757   }
41758   return SDValue();
41759 }
41760 
41761 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
41762                               TargetLowering::DAGCombinerInfo &DCI,
41763                               const X86Subtarget &Subtarget) {
41764   SDValue N0 = N->getOperand(0);
41765   EVT VT = N->getValueType(0);
41766   EVT SrcVT = N0.getValueType();
41767   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41768 
41769   // Try to match patterns such as
41770   // (i16 bitcast (v16i1 x))
41771   // ->
41772   // (i16 movmsk (16i8 sext (v16i1 x)))
41773   // before the setcc result is scalarized on subtargets that don't have legal
41774   // vxi1 types.
41775   if (DCI.isBeforeLegalize()) {
41776     SDLoc dl(N);
41777     if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
41778       return V;
41779 
41780     // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
41781     // type, widen both sides to avoid a trip through memory.
41782     if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
41783         Subtarget.hasAVX512()) {
41784       N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
41785       N0 = DAG.getBitcast(MVT::v8i1, N0);
41786       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
41787                          DAG.getIntPtrConstant(0, dl));
41788     }
41789 
41790     // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
41791     // type, widen both sides to avoid a trip through memory.
41792     if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
41793         Subtarget.hasAVX512()) {
41794       // Use zeros for the widening if we already have some zeros. This can
41795       // allow SimplifyDemandedBits to remove scalar ANDs that may be
41796       // downstream of this.
41797       // FIXME: It might make sense to detect a concat_vectors with a mix of
41798       // zeroes and undef and turn it into insert_subvector for i1 vectors as
41799       // a separate combine. What we can't do is canonicalize the operands of
41800       // such a concat or we'll get into a loop with SimplifyDemandedBits.
41801       if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
41802         SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
41803         if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
41804           SrcVT = LastOp.getValueType();
41805           unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
41806           SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
41807           Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
41808           N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
41809           N0 = DAG.getBitcast(MVT::i8, N0);
41810           return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
41811         }
41812       }
41813 
41814       unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
41815       SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
41816       Ops[0] = N0;
41817       N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
41818       N0 = DAG.getBitcast(MVT::i8, N0);
41819       return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
41820     }
41821   } else {
41822     // If we're bitcasting from iX to vXi1, see if the integer originally
41823     // began as a vXi1 and whether we can remove the bitcast entirely.
41824     if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
41825         SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
41826       if (SDValue V =
41827               combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
41828         return V;
41829     }
41830   }
41831 
41832   // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
41833   // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
41834   // due to insert_subvector legalization on KNL. By promoting the copy to i16
41835   // we can help with known bits propagation from the vXi1 domain to the
41836   // scalar domain.
41837   if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
41838       !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41839       N0.getOperand(0).getValueType() == MVT::v16i1 &&
41840       isNullConstant(N0.getOperand(1)))
41841     return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
41842                        DAG.getBitcast(MVT::i16, N0.getOperand(0)));
41843 
41844   // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
41845   // and the vbroadcast_load are both integer or both fp. In some cases this
41846   // will remove the bitcast entirely.
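  // For example (illustrative only):
  //   (v4f32 (bitcast (v2i64 (VBROADCAST_LOAD i64 ptr))))
  // is rebuilt as
  //   (v4f32 (bitcast (v2f64 (VBROADCAST_LOAD f64 ptr))))
  // so both sides of the remaining bitcast are FP and it can often be folded
  // away by later combines.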
41847   if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
41848        VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
41849     auto *BCast = cast<MemIntrinsicSDNode>(N0);
41850     unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
41851     unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
41852     // Don't swap i8/i16 since we don't have fp types of that size.
41853     if (MemSize >= 32) {
41854       MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
41855                                        : MVT::getIntegerVT(MemSize);
41856       MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
41857                                         : MVT::getIntegerVT(SrcVTSize);
41858       LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
41859 
41860       SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
41861       SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
41862       SDValue ResNode =
41863           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
41864                                   MemVT, BCast->getMemOperand());
41865       DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
41866       return DAG.getBitcast(VT, ResNode);
41867     }
41868   }
41869 
41870   // Since MMX types are special and don't usually interact with other vector
41871   // types, it's better to handle them early to be sure we emit efficient code
41872   // and avoid store-load conversions.
41873   if (VT == MVT::x86mmx) {
41874     // Detect MMX constant vectors.
41875     APInt UndefElts;
41876     SmallVector<APInt, 1> EltBits;
41877     if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
41878       SDLoc DL(N0);
41879       // Handle zero-extension of i32 with MOVD.
41880       if (EltBits[0].countLeadingZeros() >= 32)
41881         return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
41882                            DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
41883       // Else, bitcast to a double.
41884       // TODO - investigate supporting sext 32-bit immediates on x86_64.
41885       APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
41886       return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
41887     }
41888 
41889     // Detect bitcasts to x86mmx low word.
41890     if (N0.getOpcode() == ISD::BUILD_VECTOR &&
41891         (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
41892         N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
41893       bool LowUndef = true, AllUndefOrZero = true;
41894       for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
41895         SDValue Op = N0.getOperand(i);
41896         LowUndef &= Op.isUndef() || (i >= e/2);
41897         AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
41898       }
41899       if (AllUndefOrZero) {
41900         SDValue N00 = N0.getOperand(0);
41901         SDLoc dl(N00);
41902         N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
41903                        : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
41904         return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
41905       }
41906     }
41907 
41908     // Detect bitcasts of 64-bit build vectors and convert to an
41909     // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
41910     // lowest element.
41911     if (N0.getOpcode() == ISD::BUILD_VECTOR &&
41912         (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
41913          SrcVT == MVT::v8i8))
41914       return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
41915 
41916     // Detect bitcasts from element or subvector extractions to x86mmx.
41917     if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
41918          N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
41919         isNullConstant(N0.getOperand(1))) {
41920       SDValue N00 = N0.getOperand(0);
41921       if (N00.getValueType().is128BitVector())
41922         return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
41923                            DAG.getBitcast(MVT::v2i64, N00));
41924     }
41925 
41926     // Detect bitcasts from FP_TO_SINT to x86mmx.
41927     if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
41928       SDLoc DL(N0);
41929       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
41930                                 DAG.getUNDEF(MVT::v2i32));
41931       return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
41932                          DAG.getBitcast(MVT::v2i64, Res));
41933     }
41934   }
41935 
41936   // Try to remove a bitcast of a constant vXi1 vector. We have to legalize
41937   // most of these to scalar anyway.
41938   if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
41939       SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
41940       ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
41941     return combinevXi1ConstantToInteger(N0, DAG);
41942   }
41943 
41944   if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
41945       VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
41946       isa<ConstantSDNode>(N0)) {
41947     auto *C = cast<ConstantSDNode>(N0);
41948     if (C->isAllOnes())
41949       return DAG.getConstant(1, SDLoc(N0), VT);
41950     if (C->isZero())
41951       return DAG.getConstant(0, SDLoc(N0), VT);
41952   }
41953 
41954   // Look for a MOVMSK that may be truncated and then bitcast to vXi1.
41955   // Turn it into a sign-bit compare that produces a k-register. This avoids
41956   // a trip through a GPR.
41957   if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
41958       VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
41959       isPowerOf2_32(VT.getVectorNumElements())) {
41960     unsigned NumElts = VT.getVectorNumElements();
41961     SDValue Src = N0;
41962 
41963     // Peek through truncate.
41964     if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
41965       Src = N0.getOperand(0);
41966 
41967     if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
41968       SDValue MovmskIn = Src.getOperand(0);
41969       MVT MovmskVT = MovmskIn.getSimpleValueType();
41970       unsigned MovMskElts = MovmskVT.getVectorNumElements();
41971 
41972       // We allow extra bits of the movmsk to be used since they are known zero.
41973       // We can't convert a VPMOVMSKB without avx512bw.
41974       if (MovMskElts <= NumElts &&
41975           (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
41976         EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
41977         MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
41978         SDLoc dl(N);
41979         MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
41980         SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
41981                                    DAG.getConstant(0, dl, IntVT), ISD::SETLT);
41982         if (EVT(CmpVT) == VT)
41983           return Cmp;
41984 
41985         // Pad with zeroes up to original VT to replace the zeroes that were
41986         // being used from the MOVMSK.
41987         unsigned NumConcats = NumElts / MovMskElts;
41988         SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
41989         Ops[0] = Cmp;
41990         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
41991       }
41992     }
41993   }
41994 
41995   // Try to remove bitcasts from input and output of mask arithmetic to
41996   // remove GPR<->K-register crossings.
41997   if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
41998     return V;
41999 
42000   // Convert a bitcasted integer logic operation that has one bitcasted
42001   // floating-point operand into a floating-point logic operation. This may
42002   // create a load of a constant, but that is cheaper than materializing the
42003   // constant in an integer register and transferring it to an SSE register or
42004   // transferring the SSE operand to an integer register and back.
42005   unsigned FPOpcode;
42006   switch (N0.getOpcode()) {
42007     case ISD::AND: FPOpcode = X86ISD::FAND; break;
42008     case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
42009     case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
42010     default: return SDValue();
42011   }
42012 
42013   // Check that the destination is a supported FP type or legal integer vector.
42014   if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
42015         (Subtarget.hasSSE2() && VT == MVT::f64) ||
42016         (Subtarget.hasFP16() && VT == MVT::f16) ||
42017         (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
42018          TLI.isTypeLegal(VT))))
42019     return SDValue();
42020 
42021   SDValue LogicOp0 = N0.getOperand(0);
42022   SDValue LogicOp1 = N0.getOperand(1);
42023   SDLoc DL0(N0);
42024 
42025   // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
42026   if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
42027       LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
42028       LogicOp0.getOperand(0).getValueType() == VT &&
42029       !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
42030     SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
42031     unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
42032     return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
42033   }
42034   // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
42035   if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
42036       LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
42037       LogicOp1.getOperand(0).getValueType() == VT &&
42038       !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
42039     SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
42040     unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
42041     return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
42042   }
42043 
42044   return SDValue();
42045 }
42046 
42047 // (mul (zext a), (sext b))
42048 static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
42049                          SDValue &Op1) {
42050   Op0 = Mul.getOperand(0);
42051   Op1 = Mul.getOperand(1);
42052 
42053   // Canonicalize so that the sign-extended operand is Op1.
42054   if (Op0.getOpcode() == ISD::SIGN_EXTEND)
42055     std::swap(Op0, Op1);
42056 
42057   auto IsFreeTruncation = [](SDValue &Op) -> bool {
42058     if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
42059          Op.getOpcode() == ISD::SIGN_EXTEND) &&
42060         Op.getOperand(0).getScalarValueSizeInBits() <= 8)
42061       return true;
42062 
42063     auto *BV = dyn_cast<BuildVectorSDNode>(Op);
42064     return (BV && BV->isConstant());
42065   };
42066 
42067   // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
42068   // value, we need to check that Op0 is a zero-extended value. Op1 should be a
42069   // signed value, so we just check its significant (signed) bits.
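  // For example (illustrative only), both operands of
  //   (mul (zero_extend v16i8:A to v16i32), (sign_extend v16i8:B to v16i32))
  // pass these checks: A is known to fit in 8 unsigned bits and B fits in 8
  // signed bits, so the product can be formed by VPDPBUSD.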
42070   if ((IsFreeTruncation(Op0) &&
42071        DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
42072       (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
42073     return true;
42074 
42075   return false;
42076 }
42077 
42078 // Given an ABS node, detect the following pattern:
42079 // (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
42080 // This is useful as it is the input into a SAD pattern.
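// For unsigned i8 inputs, (abs (sub (zext a), (zext b))) computes |a - b|
// exactly in the wider type (e.g. a = 10, b = 250 gives abs(10 - 250) = 240),
// which is the per-element quantity that PSADBW accumulates.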
42081 static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
42082   SDValue AbsOp1 = Abs->getOperand(0);
42083   if (AbsOp1.getOpcode() != ISD::SUB)
42084     return false;
42085 
42086   Op0 = AbsOp1.getOperand(0);
42087   Op1 = AbsOp1.getOperand(1);
42088 
42089   // Check if the operands of the sub are zero-extended from vectors of i8.
42090   if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
42091       Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
42092       Op1.getOpcode() != ISD::ZERO_EXTEND ||
42093       Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
42094     return false;
42095 
42096   return true;
42097 }
42098 
42099 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
42100                               unsigned &LogBias, const SDLoc &DL,
42101                               const X86Subtarget &Subtarget) {
42102   // Extend or truncate to MVT::i8 first.
42103   MVT Vi8VT =
42104       MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
42105   LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
42106   RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
42107 
42108   // VPDPBUSD(<16 x i32> C, <16 x i8> A, <16 x i8> B). For each dst element:
42109   // C[0] = C[0] + A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + A[3]*B[3].
42110   // The src A, B element type is i8, but the dst C element type is i32.
42111   // The reduction stage count is computed from the vXi8 src vector type, so
42112   // we need a log-bias of 2 to avoid emitting 2 redundant reduction stages.
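  // In other words, a single VPDPBUSD already sums each group of 4 byte
  // products, so Log2(4) == 2 of the usual shuffle+add reduction stages become
  // unnecessary.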
42113   LogBias = 2;
42114 
42115   unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
42116   if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
42117     RegSize = std::max(512u, RegSize);
42118 
42119   // "Zero-extend" the i8 vectors. This is not a per-element zext; rather, we
42120   // fill in the missing vector elements with 0.
42121   unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
42122   SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
42123   Ops[0] = LHS;
42124   MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
42125   SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
42126   Ops[0] = RHS;
42127   SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
42128 
42129   // Actually build the dot product, split as 256/512 bits for
42130   // AVXVNNI/AVX512VNNI.
42131   auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
42132                        ArrayRef<SDValue> Ops) {
42133     MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
42134     return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
42135   };
42136   MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
42137   SDValue Zero = DAG.getConstant(0, DL, DpVT);
42138 
42139   return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
42140                           DpBuilder, false);
42141 }
42142 
42143 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
42144 // to these zexts.
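// PSADBW sums the absolute differences of each group of 8 unsigned bytes into
// one 64-bit lane, e.g. a v16i8 input pair produces a v2i64 result whose lanes
// hold the two partial sums.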
42145 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
42146                             const SDValue &Zext1, const SDLoc &DL,
42147                             const X86Subtarget &Subtarget) {
42148   // Find the appropriate width for the PSADBW.
42149   EVT InVT = Zext0.getOperand(0).getValueType();
42150   unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
42151 
42152   // "Zero-extend" the i8 vectors. This is not a per-element zext; rather, we
42153   // fill in the missing vector elements with 0.
42154   unsigned NumConcat = RegSize / InVT.getSizeInBits();
42155   SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
42156   Ops[0] = Zext0.getOperand(0);
42157   MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
42158   SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
42159   Ops[0] = Zext1.getOperand(0);
42160   SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
42161 
42162   // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
42163   auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
42164                           ArrayRef<SDValue> Ops) {
42165     MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
42166     return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
42167   };
42168   MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
42169   return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
42170                           PSADBWBuilder);
42171 }
42172 
42173 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
42174 // PHMINPOSUW.
42175 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
42176                                       const X86Subtarget &Subtarget) {
42177   // Bail without SSE41.
42178   if (!Subtarget.hasSSE41())
42179     return SDValue();
42180 
42181   EVT ExtractVT = Extract->getValueType(0);
42182   if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
42183     return SDValue();
42184 
42185   // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
42186   ISD::NodeType BinOp;
42187   SDValue Src = DAG.matchBinOpReduction(
42188       Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
42189   if (!Src)
42190     return SDValue();
42191 
42192   EVT SrcVT = Src.getValueType();
42193   EVT SrcSVT = SrcVT.getScalarType();
42194   if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
42195     return SDValue();
42196 
42197   SDLoc DL(Extract);
42198   SDValue MinPos = Src;
42199 
42200   // First, reduce the source down to 128 bits, applying BinOp to lo/hi.
42201   while (SrcVT.getSizeInBits() > 128) {
42202     SDValue Lo, Hi;
42203     std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
42204     SrcVT = Lo.getValueType();
42205     MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
42206   }
42207   assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
42208           (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
42209          "Unexpected value type");
42210 
42211   // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
42212   // to flip the values accordingly.
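  // The XOR below remaps the element values so that the required extremum
  // becomes an unsigned minimum: for SMAX, XOR with the signed-max constant
  // maps larger signed values to smaller unsigned values; for SMIN, XOR with
  // the sign-bit constant converts signed order to unsigned order; for UMAX,
  // XOR with all-ones (a NOT) reverses the unsigned order. Applying the same
  // XOR again after PHMINPOSUW recovers the original value.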
42213   SDValue Mask;
42214   unsigned MaskEltsBits = ExtractVT.getSizeInBits();
42215   if (BinOp == ISD::SMAX)
42216     Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
42217   else if (BinOp == ISD::SMIN)
42218     Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
42219   else if (BinOp == ISD::UMAX)
42220     Mask = DAG.getAllOnesConstant(DL, SrcVT);
42221 
42222   if (Mask)
42223     MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
42224 
42225   // For v16i8 cases we need to perform UMIN on pairs of byte elements,
42226   // shuffling each upper element down and inserting zeros. This means that the
42227   // v16i8 UMIN will leave the upper elements as zero, performing zero-extension
42228   // ready for the PHMINPOS.
42229   if (ExtractVT == MVT::i8) {
42230     SDValue Upper = DAG.getVectorShuffle(
42231         SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
42232         {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
42233     MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
42234   }
42235 
42236   // Perform the PHMINPOS on a v8i16 vector.
42237   MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
42238   MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
42239   MinPos = DAG.getBitcast(SrcVT, MinPos);
42240 
42241   if (Mask)
42242     MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
42243 
42244   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
42245                      DAG.getIntPtrConstant(0, DL));
42246 }
42247 
42248 // Attempt to replace an all_of/any_of/parity style horizontal reduction with
// a MOVMSK.
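// For example (illustrative only), an all_of reduction over a v4i32 compare
// result becomes (MOVMSKPS mask) == 0xF, an any_of becomes
// (MOVMSKPS mask) != 0, and a parity reduction becomes parity(MOVMSKPS mask).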
42249 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
42250                                          const X86Subtarget &Subtarget) {
42251   // Bail without SSE2.
42252   if (!Subtarget.hasSSE2())
42253     return SDValue();
42254 
42255   EVT ExtractVT = Extract->getValueType(0);
42256   unsigned BitWidth = ExtractVT.getSizeInBits();
42257   if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
42258       ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
42259     return SDValue();
42260 
42261   // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
42262   ISD::NodeType BinOp;
42263   SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
42264   if (!Match && ExtractVT == MVT::i1)
42265     Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
42266   if (!Match)
42267     return SDValue();
42268 
42269   // EXTRACT_VECTOR_ELT can require implicit extension of the vector element,
42270   // which we can't support here for now.
42271   if (Match.getScalarValueSizeInBits() != BitWidth)
42272     return SDValue();
42273 
42274   SDValue Movmsk;
42275   SDLoc DL(Extract);
42276   EVT MatchVT = Match.getValueType();
42277   unsigned NumElts = MatchVT.getVectorNumElements();
42278   unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
42279   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42280 
42281   if (ExtractVT == MVT::i1) {
42282     // Special case for (pre-legalization) vXi1 reductions.
42283     if (NumElts > 64 || !isPowerOf2_32(NumElts))
42284       return SDValue();
42285     if (TLI.isTypeLegal(MatchVT)) {
42286       // If this is a legal AVX512 predicate type then we can just bitcast.
42287       EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
42288       Movmsk = DAG.getBitcast(MovmskVT, Match);
42289     } else {
42290       // For all_of(setcc(x,y,eq)) - use PMOVMSKB(PCMPEQB()).
42291       if (BinOp == ISD::AND && Match.getOpcode() == ISD::SETCC &&
42292           cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
42293               ISD::CondCode::SETEQ) {
42294         EVT VecSVT = Match.getOperand(0).getValueType().getScalarType();
42295         if (VecSVT != MVT::i8) {
42296           NumElts *= VecSVT.getSizeInBits() / 8;
42297           EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, NumElts);
42298           MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
42299           Match = DAG.getSetCC(
42300               DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
42301               DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
42302         }
42303       }
42304 
42305       // Use combineBitcastvxi1 to create the MOVMSK.
42306       while (NumElts > MaxElts) {
42307         SDValue Lo, Hi;
42308         std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
42309         Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
42310         NumElts /= 2;
42311       }
42312       EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
42313       Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
42314     }
42315     if (!Movmsk)
42316       return SDValue();
42317     Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
42318   } else {
42319     // FIXME: Better handling of k-registers or 512-bit vectors?
42320     unsigned MatchSizeInBits = Match.getValueSizeInBits();
42321     if (!(MatchSizeInBits == 128 ||
42322           (MatchSizeInBits == 256 && Subtarget.hasAVX())))
42323       return SDValue();
42324 
42325     // Make sure this isn't a vector of 1 element. The perf win from using
42326   // MOVMSK diminishes with fewer elements in the reduction, but it is
42327     // generally better to get the comparison over to the GPRs as soon as
42328     // possible to reduce the number of vector ops.
42329     if (Match.getValueType().getVectorNumElements() < 2)
42330       return SDValue();
42331 
42332     // Check that we are extracting a reduction of all sign bits.
42333     if (DAG.ComputeNumSignBits(Match) != BitWidth)
42334       return SDValue();
42335 
42336     if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
42337       SDValue Lo, Hi;
42338       std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
42339       Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
42340       MatchSizeInBits = Match.getValueSizeInBits();
42341     }
42342 
42343     // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
42344     MVT MaskSrcVT;
42345     if (64 == BitWidth || 32 == BitWidth)
42346       MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
42347                                    MatchSizeInBits / BitWidth);
42348     else
42349       MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
42350 
42351     SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
42352     Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
42353     NumElts = MaskSrcVT.getVectorNumElements();
42354   }
42355   assert((NumElts <= 32 || NumElts == 64) &&
42356          "Not expecting more than 64 elements");
42357 
42358   MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
42359   if (BinOp == ISD::XOR) {
42360     // parity -> (PARITY(MOVMSK X))
42361     SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
42362     return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
42363   }
42364 
42365   SDValue CmpC;
42366   ISD::CondCode CondCode;
42367   if (BinOp == ISD::OR) {
42368     // any_of -> MOVMSK != 0
42369     CmpC = DAG.getConstant(0, DL, CmpVT);
42370     CondCode = ISD::CondCode::SETNE;
42371   } else {
42372     // all_of -> MOVMSK == ((1 << NumElts) - 1)
42373     CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
42374                            DL, CmpVT);
42375     CondCode = ISD::CondCode::SETEQ;
42376   }
42377 
42378   // The setcc produces an i8 of 0/1, so extend that to the result width and
42379   // negate to get the final 0/-1 mask value.
42380   EVT SetccVT =
42381       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
42382   SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
42383   SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
42384   SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
42385   return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
42386 }
42387 
42388 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
42389                                       const X86Subtarget &Subtarget) {
42390   if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
42391     return SDValue();
42392 
42393   EVT ExtractVT = Extract->getValueType(0);
42394   // Verify the type we're extracting is i32, as the output element type of
42395   // vpdpbusd is i32.
42396   if (ExtractVT != MVT::i32)
42397     return SDValue();
42398 
42399   EVT VT = Extract->getOperand(0).getValueType();
42400   if (!isPowerOf2_32(VT.getVectorNumElements()))
42401     return SDValue();
42402 
42403   // Match shuffle + add pyramid.
42404   ISD::NodeType BinOp;
42405   SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
42406 
42407   // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
42408   // done by vpdpbusd computes a signed 16-bit product that is sign-extended
42409   // before being added into the accumulator.
42410   // TODO:
42411   // We also need to verify that the multiply has at least 2x the number of bits
42412   // of the input. We shouldn't match
42413   // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
42414   // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
42415   //   Root = Root.getOperand(0);
42416 
42417   // If there was a match, we want Root to be a mul.
42418   if (!Root || Root.getOpcode() != ISD::MUL)
42419     return SDValue();
42420 
42421   // Check whether we have an extend-and-mul pattern.
42422   SDValue LHS, RHS;
42423   if (!detectExtMul(DAG, Root, LHS, RHS))
42424     return SDValue();
42425 
42426   // Create the dot product instruction.
42427   SDLoc DL(Extract);
42428   unsigned StageBias;
42429   SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
42430 
42431   // If the original vector was wider than 4 elements, sum over the results
42432   // in the DP vector.
42433   unsigned Stages = Log2_32(VT.getVectorNumElements());
42434   EVT DpVT = DP.getValueType();
42435 
42436   if (Stages > StageBias) {
42437     unsigned DpElems = DpVT.getVectorNumElements();
42438 
42439     for (unsigned i = Stages - StageBias; i > 0; --i) {
42440       SmallVector<int, 16> Mask(DpElems, -1);
42441       for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
42442         Mask[j] = MaskEnd + j;
42443 
42444       SDValue Shuffle =
42445           DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
42446       DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
42447     }
42448   }
42449 
42450   // Return the lowest ExtractVT.getSizeInBits() bits.
42451   EVT ResVT =
42452       EVT::getVectorVT(*DAG.getContext(), ExtractVT,
42453                        DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
42454   DP = DAG.getBitcast(ResVT, DP);
42455   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
42456                      Extract->getOperand(1));
42457 }
42458 
42459 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
42460                                       const X86Subtarget &Subtarget) {
42461   // PSADBW is only supported on SSE2 and up.
42462   if (!Subtarget.hasSSE2())
42463     return SDValue();
42464 
42465   EVT ExtractVT = Extract->getValueType(0);
42466   // Verify the type we're extracting is either i32 or i64.
42467   // FIXME: Could support other types, but this is what we have coverage for.
42468   if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
42469     return SDValue();
42470 
42471   EVT VT = Extract->getOperand(0).getValueType();
42472   if (!isPowerOf2_32(VT.getVectorNumElements()))
42473     return SDValue();
42474 
42475   // Match shuffle + add pyramid.
42476   ISD::NodeType BinOp;
42477   SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
42478 
42479   // The operand is expected to be zero-extended from i8
42480   // (verified in detectZextAbsDiff).
42481   // In order to convert to i64 and above, an additional any/zero/sign
42482   // extend is expected.
42483   // The zero extend from 32 bits has no mathematical effect on the result.
42484   // The sign extend is also effectively a zero extend
42485   // (it extends the sign bit, which is zero).
42486   // So it is correct to skip the sign/zero extend instruction.
42487   if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
42488                Root.getOpcode() == ISD::ZERO_EXTEND ||
42489                Root.getOpcode() == ISD::ANY_EXTEND))
42490     Root = Root.getOperand(0);
42491 
42492   // If there was a match, we want Root to be the ABS node at the root of an
42493   // abs-diff pattern.
42494   if (!Root || Root.getOpcode() != ISD::ABS)
42495     return SDValue();
42496 
42497   // Check whether we have an abs-diff pattern feeding into the ABS.
42498   SDValue Zext0, Zext1;
42499   if (!detectZextAbsDiff(Root, Zext0, Zext1))
42500     return SDValue();
42501 
42502   // Create the SAD instruction.
42503   SDLoc DL(Extract);
42504   SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
42505 
42506   // If the original vector was wider than 8 elements, sum over the results
42507   // in the SAD vector.
42508   unsigned Stages = Log2_32(VT.getVectorNumElements());
42509   EVT SadVT = SAD.getValueType();
42510   if (Stages > 3) {
42511     unsigned SadElems = SadVT.getVectorNumElements();
42512 
42513     for (unsigned i = Stages - 3; i > 0; --i) {
42514       SmallVector<int, 16> Mask(SadElems, -1);
42515       for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
42516         Mask[j] = MaskEnd + j;
42517 
42518       SDValue Shuffle =
42519           DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
42520       SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
42521     }
42522   }
42523 
42524   unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
42525   // Return the lowest ExtractSizeInBits bits.
42526   EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
42527                                SadVT.getSizeInBits() / ExtractSizeInBits);
42528   SAD = DAG.getBitcast(ResVT, SAD);
42529   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
42530                      Extract->getOperand(1));
42531 }
42532 
42533 // Attempt to peek through a target shuffle and extract the scalar from the
42534 // source.
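// For example (illustrative only), extracting element 0 from
//   (v4i32 (PSHUFD V, <2,3,0,1>))
// is the same as extracting element 2 directly from V, which the code below
// resolves via the target shuffle mask.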
42535 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
42536                                          TargetLowering::DAGCombinerInfo &DCI,
42537                                          const X86Subtarget &Subtarget) {
42538   if (DCI.isBeforeLegalizeOps())
42539     return SDValue();
42540 
42541   SDLoc dl(N);
42542   SDValue Src = N->getOperand(0);
42543   SDValue Idx = N->getOperand(1);
42544 
42545   EVT VT = N->getValueType(0);
42546   EVT SrcVT = Src.getValueType();
42547   EVT SrcSVT = SrcVT.getVectorElementType();
42548   unsigned SrcEltBits = SrcSVT.getSizeInBits();
42549   unsigned NumSrcElts = SrcVT.getVectorNumElements();
42550 
42551   // Don't attempt this for boolean mask vectors or unknown extraction indices.
42552   if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
42553     return SDValue();
42554 
42555   const APInt &IdxC = N->getConstantOperandAPInt(1);
42556   if (IdxC.uge(NumSrcElts))
42557     return SDValue();
42558 
42559   SDValue SrcBC = peekThroughBitcasts(Src);
42560 
42561   // Handle extract(bitcast(broadcast(scalar_value))).
42562   if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
42563     SDValue SrcOp = SrcBC.getOperand(0);
42564     EVT SrcOpVT = SrcOp.getValueType();
42565     if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
42566         (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
42567       unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
42568       unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
42569       // TODO support non-zero offsets.
42570       if (Offset == 0) {
42571         SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
42572         SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
42573         return SrcOp;
42574       }
42575     }
42576   }
42577 
42578   // If we're extracting a single element from a broadcast load and there are
42579   // no other users, just create a single load.
42580   if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
42581     auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
42582     unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
42583     if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
42584         VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
42585       SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
42586                                  MemIntr->getBasePtr(),
42587                                  MemIntr->getPointerInfo(),
42588                                  MemIntr->getOriginalAlign(),
42589                                  MemIntr->getMemOperand()->getFlags());
42590       DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42591       return Load;
42592     }
42593   }
42594 
42595   // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
42596   // TODO: Move to DAGCombine?
42597   if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
42598       SrcBC.getValueType().isInteger() &&
42599       (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
42600       SrcBC.getScalarValueSizeInBits() ==
42601           SrcBC.getOperand(0).getValueSizeInBits()) {
42602     unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
42603     if (IdxC.ult(Scale)) {
42604       unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
42605       SDValue Scl = SrcBC.getOperand(0);
42606       EVT SclVT = Scl.getValueType();
42607       if (Offset) {
42608         Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
42609                           DAG.getShiftAmountConstant(Offset, SclVT, dl));
42610       }
42611       Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
42612       Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
42613       return Scl;
42614     }
42615   }
42616 
42617   // Handle extract(truncate(x)) for the 0th index.
42618   // TODO: Treat this as a faux shuffle?
42619   // TODO: When can we use this for general indices?
42620   if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
42621       (SrcVT.getSizeInBits() % 128) == 0) {
42622     Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
42623     MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
42624     return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
42625                        Idx);
42626   }
42627 
42628   // We can only legally extract other elements from 128-bit vectors and in
42629   // certain circumstances, depending on SSE-level.
42630   // TODO: Investigate float/double extraction if it will be just stored.
42631   auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
42632                                                  unsigned Idx) {
42633     EVT VecSVT = VecVT.getScalarType();
42634     if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
42635         (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
42636          VecSVT == MVT::i64)) {
42637       unsigned EltSizeInBits = VecSVT.getSizeInBits();
42638       unsigned NumEltsPerLane = 128 / EltSizeInBits;
42639       unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
42640       unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
42641       VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
42642       Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
42643       Idx &= (NumEltsPerLane - 1);
42644     }
42645     if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
42646         ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
42647       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
42648                          DAG.getBitcast(VecVT, Vec),
42649                          DAG.getIntPtrConstant(Idx, dl));
42650     }
42651     if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
42652         (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
42653       unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
42654       return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
42655                          DAG.getTargetConstant(Idx, dl, MVT::i8));
42656     }
42657     return SDValue();
42658   };
42659 
42660   // Resolve the target shuffle inputs and mask.
42661   SmallVector<int, 16> Mask;
42662   SmallVector<SDValue, 2> Ops;
42663   if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
42664     return SDValue();
42665 
42666   // Shuffle inputs must be the same size as the result.
42667   if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
42668         return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
42669       }))
42670     return SDValue();
42671 
42672   // Attempt to narrow/widen the shuffle mask to the correct size.
42673   if (Mask.size() != NumSrcElts) {
42674     if ((NumSrcElts % Mask.size()) == 0) {
42675       SmallVector<int, 16> ScaledMask;
42676       int Scale = NumSrcElts / Mask.size();
42677       narrowShuffleMaskElts(Scale, Mask, ScaledMask);
42678       Mask = std::move(ScaledMask);
42679     } else if ((Mask.size() % NumSrcElts) == 0) {
42680       // Simplify Mask based on demanded element.
42681       int ExtractIdx = (int)IdxC.getZExtValue();
42682       int Scale = Mask.size() / NumSrcElts;
42683       int Lo = Scale * ExtractIdx;
42684       int Hi = Scale * (ExtractIdx + 1);
42685       for (int i = 0, e = (int)Mask.size(); i != e; ++i)
42686         if (i < Lo || Hi <= i)
42687           Mask[i] = SM_SentinelUndef;
42688 
42689       SmallVector<int, 16> WidenedMask;
42690       while (Mask.size() > NumSrcElts &&
42691              canWidenShuffleElements(Mask, WidenedMask))
42692         Mask = std::move(WidenedMask);
42693     }
42694   }
42695 
42696   // If narrowing/widening failed, see if we can extract+zero-extend.
42697   int ExtractIdx;
42698   EVT ExtractVT;
42699   if (Mask.size() == NumSrcElts) {
42700     ExtractIdx = Mask[IdxC.getZExtValue()];
42701     ExtractVT = SrcVT;
42702   } else {
42703     unsigned Scale = Mask.size() / NumSrcElts;
42704     if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
42705       return SDValue();
42706     unsigned ScaledIdx = Scale * IdxC.getZExtValue();
42707     if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
42708       return SDValue();
42709     ExtractIdx = Mask[ScaledIdx];
42710     EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
42711     ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
42712     assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
42713            "Failed to widen vector type");
42714   }
42715 
42716   // If the shuffle source element is undef/zero then we can just accept it.
42717   if (ExtractIdx == SM_SentinelUndef)
42718     return DAG.getUNDEF(VT);
42719 
42720   if (ExtractIdx == SM_SentinelZero)
42721     return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
42722                                 : DAG.getConstant(0, dl, VT);
42723 
42724   SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
42725   ExtractIdx = ExtractIdx % Mask.size();
42726   if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
42727     return DAG.getZExtOrTrunc(V, dl, VT);
42728 
42729   return SDValue();
42730 }
42731 
42732 /// Extracting a scalar FP value from vector element 0 is free, so extract each
42733 /// operand first, then perform the math as a scalar op.
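/// For example:
///   extract (fadd X, Y), 0 --> fadd (extract X, 0), (extract Y, 0)
/// The element-0 extracts are free because the scalar value already lives in
/// the low element of the vector register.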
42734 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
42735                                  const X86Subtarget &Subtarget) {
42736   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
42737   SDValue Vec = ExtElt->getOperand(0);
42738   SDValue Index = ExtElt->getOperand(1);
42739   EVT VT = ExtElt->getValueType(0);
42740   EVT VecVT = Vec.getValueType();
42741 
42742   // TODO: If this is a unary/expensive/expand op, allow extraction from a
42743   // non-zero element because the shuffle+scalar op will be cheaper?
42744   if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
42745     return SDValue();
42746 
42747   // Vector FP compares don't fit the pattern of FP math ops (propagate, not
42748   // extract, the condition code), so deal with those as a special case.
42749   if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
42750     EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
42751     if (OpVT != MVT::f32 && OpVT != MVT::f64)
42752       return SDValue();
42753 
42754     // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
42755     SDLoc DL(ExtElt);
42756     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
42757                                Vec.getOperand(0), Index);
42758     SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
42759                                Vec.getOperand(1), Index);
42760     return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
42761   }
42762 
42763   if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
42764       VT != MVT::f64)
42765     return SDValue();
42766 
42767   // Vector FP selects don't fit the pattern of FP math ops (because the
42768   // condition has a different type and we have to change the opcode), so deal
42769   // with those here.
42770   // FIXME: This is restricted to pre type legalization by ensuring the setcc
42771   // has i1 elements. If we loosen this we need to convert vector bool to a
42772   // scalar bool.
42773   if (Vec.getOpcode() == ISD::VSELECT &&
42774       Vec.getOperand(0).getOpcode() == ISD::SETCC &&
42775       Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
42776       Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
42777     // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
42778     SDLoc DL(ExtElt);
42779     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
42780                                Vec.getOperand(0).getValueType().getScalarType(),
42781                                Vec.getOperand(0), Index);
42782     SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
42783                                Vec.getOperand(1), Index);
42784     SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
42785                                Vec.getOperand(2), Index);
42786     return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
42787   }
42788 
42789   // TODO: This switch could include FNEG and the x86-specific FP logic ops
42790   // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
42791   // missed load folding and fma+fneg combining.
42792   switch (Vec.getOpcode()) {
42793   case ISD::FMA: // Begin 3 operands
42794   case ISD::FMAD:
42795   case ISD::FADD: // Begin 2 operands
42796   case ISD::FSUB:
42797   case ISD::FMUL:
42798   case ISD::FDIV:
42799   case ISD::FREM:
42800   case ISD::FCOPYSIGN:
42801   case ISD::FMINNUM:
42802   case ISD::FMAXNUM:
42803   case ISD::FMINNUM_IEEE:
42804   case ISD::FMAXNUM_IEEE:
42805   case ISD::FMAXIMUM:
42806   case ISD::FMINIMUM:
42807   case X86ISD::FMAX:
42808   case X86ISD::FMIN:
42809   case ISD::FABS: // Begin 1 operand
42810   case ISD::FSQRT:
42811   case ISD::FRINT:
42812   case ISD::FCEIL:
42813   case ISD::FTRUNC:
42814   case ISD::FNEARBYINT:
42815   case ISD::FROUND:
42816   case ISD::FFLOOR:
42817   case X86ISD::FRCP:
42818   case X86ISD::FRSQRT: {
42819     // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
42820     SDLoc DL(ExtElt);
42821     SmallVector<SDValue, 4> ExtOps;
42822     for (SDValue Op : Vec->ops())
42823       ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
42824     return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
42825   }
42826   default:
42827     return SDValue();
42828   }
42829   llvm_unreachable("All opcodes should return within switch");
42830 }
42831 
42832 /// Try to convert a vector reduction sequence composed of binops and shuffles
42833 /// into horizontal ops.
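/// For example, the vXi8 add-reduction cases below are lowered via PSADBW
/// against a zero vector (summing groups of bytes into 64-bit lanes), while
/// other element types may instead use horizontal-add style sequences,
/// depending on the subtarget.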
42834 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
42835                                      const X86Subtarget &Subtarget) {
42836   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
42837 
42838   // We need at least SSE2 to do anything here.
42839   if (!Subtarget.hasSSE2())
42840     return SDValue();
42841 
42842   ISD::NodeType Opc;
42843   SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
42844                                         {ISD::ADD, ISD::MUL, ISD::FADD}, true);
42845   if (!Rdx)
42846     return SDValue();
42847 
42848   SDValue Index = ExtElt->getOperand(1);
42849   assert(isNullConstant(Index) &&
42850          "Reduction doesn't end in an extract from index 0");
42851 
42852   EVT VT = ExtElt->getValueType(0);
42853   EVT VecVT = Rdx.getValueType();
42854   if (VecVT.getScalarType() != VT)
42855     return SDValue();
42856 
42857   SDLoc DL(ExtElt);
42858 
42859   // vXi8 mul reduction - promote to vXi16 mul reduction.
42860   if (Opc == ISD::MUL) {
42861     unsigned NumElts = VecVT.getVectorNumElements();
42862     if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
42863       return SDValue();
42864     if (VecVT.getSizeInBits() >= 128) {
42865       EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
42866       SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
42867       SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
42868       Lo = DAG.getBitcast(WideVT, Lo);
42869       Hi = DAG.getBitcast(WideVT, Hi);
42870       Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
42871       while (Rdx.getValueSizeInBits() > 128) {
42872         std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
42873         Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
42874       }
42875     } else {
42876       if (VecVT == MVT::v4i8)
42877         Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
42878                           DAG.getUNDEF(MVT::v4i8));
42879       Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
42880                         DAG.getUNDEF(MVT::v8i8));
42881       Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
42882       Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
42883     }
42884     if (NumElts >= 8)
42885       Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
42886                         DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
42887                                              {4, 5, 6, 7, -1, -1, -1, -1}));
42888     Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
42889                       DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
42890                                            {2, 3, -1, -1, -1, -1, -1, -1}));
42891     Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
42892                       DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
42893                                            {1, -1, -1, -1, -1, -1, -1, -1}));
42894     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
42895     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
42896   }
42897 
  // vXi8 add reduction - sub-128-bit vector.
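  // (PSADBW against an all-zeros vector sums the unsigned bytes of each 64-bit
  // lane, performing the horizontal add in a single instruction; only the low
  // 8 bits of that sum are extracted below.)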
42899   if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
42900     if (VecVT == MVT::v4i8) {
42901       // Pad with zero.
42902       if (Subtarget.hasSSE41()) {
42903         Rdx = DAG.getBitcast(MVT::i32, Rdx);
42904         Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
42905                           DAG.getConstant(0, DL, MVT::v4i32), Rdx,
42906                           DAG.getIntPtrConstant(0, DL));
42907         Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
42908       } else {
42909         Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
42910                           DAG.getConstant(0, DL, VecVT));
42911       }
42912     }
42913     if (Rdx.getValueType() == MVT::v8i8) {
42914       // Pad with undef.
42915       Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
42916                         DAG.getUNDEF(MVT::v8i8));
42917     }
42918     Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
42919                       DAG.getConstant(0, DL, MVT::v16i8));
42920     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
42921     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
42922   }
42923 
42924   // Must be a >=128-bit vector with pow2 elements.
42925   if ((VecVT.getSizeInBits() % 128) != 0 ||
42926       !isPowerOf2_32(VecVT.getVectorNumElements()))
42927     return SDValue();
42928 
42929   // vXi8 add reduction - sum lo/hi halves then use PSADBW.
42930   if (VT == MVT::i8) {
42931     while (Rdx.getValueSizeInBits() > 128) {
42932       SDValue Lo, Hi;
42933       std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
42934       VecVT = Lo.getValueType();
42935       Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
42936     }
42937     assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
42938 
42939     SDValue Hi = DAG.getVectorShuffle(
42940         MVT::v16i8, DL, Rdx, Rdx,
42941         {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
42942     Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
42943     Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
42944                       getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
42945     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
42946     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
42947   }
42948 
  // Only use (F)HADD opcodes if they aren't microcoded or we're trying to
  // minimize codesize.
42950   if (!shouldUseHorizontalOp(true, DAG, Subtarget))
42951     return SDValue();
42952 
42953   unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
42954 
42955   // 256-bit horizontal instructions operate on 128-bit chunks rather than
42956   // across the whole vector, so we need an extract + hop preliminary stage.
42957   // This is the only step where the operands of the hop are not the same value.
42958   // TODO: We could extend this to handle 512-bit or even longer vectors.
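  // (A 256-bit (F)HADD adds adjacent element pairs within each 128-bit lane
  // independently and never across the lane boundary, hence the extra
  // extract + hop stage here.)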
42959   if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
42960       ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
42961     unsigned NumElts = VecVT.getVectorNumElements();
42962     SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
42963     SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
42964     Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
42965     VecVT = Rdx.getValueType();
42966   }
42967   if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
42968       !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
42969     return SDValue();
42970 
42971   // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
42972   unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
42973   for (unsigned i = 0; i != ReductionSteps; ++i)
42974     Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
42975 
42976   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
42977 }
42978 
42979 /// Detect vector gather/scatter index generation and convert it from being a
42980 /// bunch of shuffles and extracts into a somewhat faster sequence.
42981 /// For i686, the best sequence is apparently storing the value and loading
42982 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
42983 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
42984                                        TargetLowering::DAGCombinerInfo &DCI,
42985                                        const X86Subtarget &Subtarget) {
42986   if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
42987     return NewOp;
42988 
42989   SDValue InputVector = N->getOperand(0);
42990   SDValue EltIdx = N->getOperand(1);
42991   auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
42992 
42993   EVT SrcVT = InputVector.getValueType();
42994   EVT VT = N->getValueType(0);
42995   SDLoc dl(InputVector);
42996   bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
42997   unsigned NumSrcElts = SrcVT.getVectorNumElements();
42998 
42999   if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
43000     return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
43001 
43002   // Integer Constant Folding.
43003   if (CIdx && VT.isInteger()) {
43004     APInt UndefVecElts;
43005     SmallVector<APInt, 16> EltBits;
43006     unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
43007     if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
43008                                       EltBits, true, false)) {
43009       uint64_t Idx = CIdx->getZExtValue();
43010       if (UndefVecElts[Idx])
43011         return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
43012       return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
43013                              dl, VT);
43014     }
43015   }
43016 
43017   if (IsPextr) {
43018     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43019     if (TLI.SimplifyDemandedBits(SDValue(N, 0),
43020                                  APInt::getAllOnes(VT.getSizeInBits()), DCI))
43021       return SDValue(N, 0);
43022 
43023     // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
43024     if ((InputVector.getOpcode() == X86ISD::PINSRB ||
43025          InputVector.getOpcode() == X86ISD::PINSRW) &&
43026         InputVector.getOperand(2) == EltIdx) {
43027       assert(SrcVT == InputVector.getOperand(0).getValueType() &&
43028              "Vector type mismatch");
43029       SDValue Scl = InputVector.getOperand(1);
43030       Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
43031       return DAG.getZExtOrTrunc(Scl, dl, VT);
43032     }
43033 
43034     // TODO - Remove this once we can handle the implicit zero-extension of
43035     // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
43036     // combineBasicSADPattern.
43037     return SDValue();
43038   }
43039 
  // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
43041   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
43042       VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
43043     SDValue MMXSrc = InputVector.getOperand(0);
43044 
43045     // The bitcast source is a direct mmx result.
43046     if (MMXSrc.getValueType() == MVT::x86mmx)
43047       return DAG.getBitcast(VT, InputVector);
43048   }
43049 
43050   // Detect mmx to i32 conversion through a v2i32 elt extract.
43051   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
43052       VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
43053     SDValue MMXSrc = InputVector.getOperand(0);
43054 
43055     // The bitcast source is a direct mmx result.
43056     if (MMXSrc.getValueType() == MVT::x86mmx)
43057       return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
43058   }
43059 
  // Check whether this extract is the root of a sum of absolute differences
  // pattern. This has to be done here because we really want it to happen
  // pre-legalization.
43063   if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
43064     return SAD;
43065 
43066   if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
43067     return VPDPBUSD;
43068 
43069   // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
43070   if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
43071     return Cmp;
43072 
43073   // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
43074   if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
43075     return MinMax;
43076 
43077   // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
43078   if (SDValue V = combineArithReduction(N, DAG, Subtarget))
43079     return V;
43080 
43081   if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
43082     return V;
43083 
  // Attempt to extract an i1 element by using MOVMSK to extract the signbits
43085   // and then testing the relevant element.
43086   //
43087   // Note that we only combine extracts on the *same* result number, i.e.
43088   //   t0 = merge_values a0, a1, a2, a3
43089   //   i1 = extract_vector_elt t0, Constant:i64<2>
43090   //   i1 = extract_vector_elt t0, Constant:i64<3>
43091   // but not
43092   //   i1 = extract_vector_elt t0:1, Constant:i64<2>
43093   // since the latter would need its own MOVMSK.
43094   if (CIdx && SrcVT.getScalarType() == MVT::i1) {
43095     SmallVector<SDNode *, 16> BoolExtracts;
43096     unsigned ResNo = InputVector.getResNo();
43097     auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
43098       if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
43099           isa<ConstantSDNode>(Use->getOperand(1)) &&
43100           Use->getOperand(0).getResNo() == ResNo &&
43101           Use->getValueType(0) == MVT::i1) {
43102         BoolExtracts.push_back(Use);
43103         return true;
43104       }
43105       return false;
43106     };
43107     if (all_of(InputVector->uses(), IsBoolExtract) &&
43108         BoolExtracts.size() > 1) {
43109       EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
43110       if (SDValue BC =
43111               combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
43112         for (SDNode *Use : BoolExtracts) {
43113           // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
43114           unsigned MaskIdx = Use->getConstantOperandVal(1);
43115           APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
43116           SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
43117           SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
43118           Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
43119           DCI.CombineTo(Use, Res);
43120         }
43121         return SDValue(N, 0);
43122       }
43123     }
43124   }
43125 
43126   return SDValue();
43127 }
43128 
43129 /// If a vector select has an operand that is -1 or 0, try to simplify the
43130 /// select to a bitwise logic operation.
43131 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
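/// e.g. (vselect M, X, 0) --> (and M, X) and (vselect M, -1, X) --> (or M, X)
/// when M is a zero/all-bits (sign-splat) mask.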
43132 static SDValue
43133 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
43134                                  TargetLowering::DAGCombinerInfo &DCI,
43135                                  const X86Subtarget &Subtarget) {
43136   SDValue Cond = N->getOperand(0);
43137   SDValue LHS = N->getOperand(1);
43138   SDValue RHS = N->getOperand(2);
43139   EVT VT = LHS.getValueType();
43140   EVT CondVT = Cond.getValueType();
43141   SDLoc DL(N);
43142   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43143 
43144   if (N->getOpcode() != ISD::VSELECT)
43145     return SDValue();
43146 
43147   assert(CondVT.isVector() && "Vector select expects a vector selector!");
43148 
43149   // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
43150   // TODO: Can we assert that both operands are not zeros (because that should
43151   //       get simplified at node creation time)?
43152   bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
43153   bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
43154 
43155   // If both inputs are 0/undef, create a complete zero vector.
43156   // FIXME: As noted above this should be handled by DAGCombiner/getNode.
43157   if (TValIsAllZeros && FValIsAllZeros) {
43158     if (VT.isFloatingPoint())
43159       return DAG.getConstantFP(0.0, DL, VT);
43160     return DAG.getConstant(0, DL, VT);
43161   }
43162 
43163   // To use the condition operand as a bitwise mask, it must have elements that
  // are the same size as the select elements; i.e., the condition operand must
43165   // have already been promoted from the IR select condition type <N x i1>.
43166   // Don't check if the types themselves are equal because that excludes
43167   // vector floating-point selects.
43168   if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
43169     return SDValue();
43170 
43171   // Try to invert the condition if true value is not all 1s and false value is
43172   // not all 0s. Only do this if the condition has one use.
43173   bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
43174   if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
43175       // Check if the selector will be produced by CMPP*/PCMP*.
43176       Cond.getOpcode() == ISD::SETCC &&
43177       // Check if SETCC has already been promoted.
43178       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
43179           CondVT) {
43180     bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
43181 
43182     if (TValIsAllZeros || FValIsAllOnes) {
43183       SDValue CC = Cond.getOperand(2);
43184       ISD::CondCode NewCC = ISD::getSetCCInverse(
43185           cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
43186       Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
43187                           NewCC);
43188       std::swap(LHS, RHS);
43189       TValIsAllOnes = FValIsAllOnes;
43190       FValIsAllZeros = TValIsAllZeros;
43191     }
43192   }
43193 
43194   // Cond value must be 'sign splat' to be converted to a logical op.
43195   if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
43196     return SDValue();
43197 
43198   // vselect Cond, 111..., 000... -> Cond
43199   if (TValIsAllOnes && FValIsAllZeros)
43200     return DAG.getBitcast(VT, Cond);
43201 
43202   if (!TLI.isTypeLegal(CondVT))
43203     return SDValue();
43204 
43205   // vselect Cond, 111..., X -> or Cond, X
43206   if (TValIsAllOnes) {
43207     SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
43208     SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
43209     return DAG.getBitcast(VT, Or);
43210   }
43211 
43212   // vselect Cond, X, 000... -> and Cond, X
43213   if (FValIsAllZeros) {
43214     SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
43215     SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
43216     return DAG.getBitcast(VT, And);
43217   }
43218 
43219   // vselect Cond, 000..., X -> andn Cond, X
43220   if (TValIsAllZeros) {
43221     SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
43222     SDValue AndN;
    // The canonical form differs for i1 vectors: X86ISD::ANDNP is not used.
43224     if (CondVT.getScalarType() == MVT::i1)
43225       AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
43226                          CastRHS);
43227     else
43228       AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
43229     return DAG.getBitcast(VT, AndN);
43230   }
43231 
43232   return SDValue();
43233 }
43234 
43235 /// If both arms of a vector select are concatenated vectors, split the select,
43236 /// and concatenate the result to eliminate a wide (256-bit) vector instruction:
43237 ///   vselect Cond, (concat T0, T1), (concat F0, F1) -->
43238 ///   concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
43239 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
43240                                   const X86Subtarget &Subtarget) {
43241   unsigned Opcode = N->getOpcode();
43242   if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
43243     return SDValue();
43244 
43245   // TODO: Split 512-bit vectors too?
43246   EVT VT = N->getValueType(0);
43247   if (!VT.is256BitVector())
43248     return SDValue();
43249 
43250   // TODO: Split as long as any 2 of the 3 operands are concatenated?
43251   SDValue Cond = N->getOperand(0);
43252   SDValue TVal = N->getOperand(1);
43253   SDValue FVal = N->getOperand(2);
43254   SmallVector<SDValue, 4> CatOpsT, CatOpsF;
43255   if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
43256       !collectConcatOps(TVal.getNode(), CatOpsT) ||
43257       !collectConcatOps(FVal.getNode(), CatOpsF))
43258     return SDValue();
43259 
43260   auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
43261                             ArrayRef<SDValue> Ops) {
43262     return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
43263   };
43264   return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
43265                           makeBlend, /*CheckBWI*/ false);
43266 }
43267 
43268 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
43269   SDValue Cond = N->getOperand(0);
43270   SDValue LHS = N->getOperand(1);
43271   SDValue RHS = N->getOperand(2);
43272   SDLoc DL(N);
43273 
43274   auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
43275   auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
43276   if (!TrueC || !FalseC)
43277     return SDValue();
43278 
43279   // Don't do this for crazy integer types.
43280   EVT VT = N->getValueType(0);
43281   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
43282     return SDValue();
43283 
43284   // We're going to use the condition bit in math or logic ops. We could allow
43285   // this with a wider condition value (post-legalization it becomes an i8),
43286   // but if nothing is creating selects that late, it doesn't matter.
43287   if (Cond.getValueType() != MVT::i1)
43288     return SDValue();
43289 
43290   // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
43291   // 3, 5, or 9 with i32/i64, so those get transformed too.
43292   // TODO: For constants that overflow or do not differ by power-of-2 or small
43293   // multiplier, convert to 'and' + 'add'.
43294   const APInt &TrueVal = TrueC->getAPIntValue();
43295   const APInt &FalseVal = FalseC->getAPIntValue();
43296 
  // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB, so
  // bail out here rather than produce a worse sequence.
43298   if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
43299       Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
43300     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
43301     if (CC == ISD::SETEQ || CC == ISD::SETNE)
43302       return SDValue();
43303   }
43304 
43305   bool OV;
43306   APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
43307   if (OV)
43308     return SDValue();
43309 
43310   APInt AbsDiff = Diff.abs();
43311   if (AbsDiff.isPowerOf2() ||
43312       ((VT == MVT::i32 || VT == MVT::i64) &&
43313        (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
43314 
43315     // We need a positive multiplier constant for shift/LEA codegen. The 'not'
43316     // of the condition can usually be folded into a compare predicate, but even
43317     // without that, the sequence should be cheaper than a CMOV alternative.
43318     if (TrueVal.slt(FalseVal)) {
43319       Cond = DAG.getNOT(DL, Cond, MVT::i1);
43320       std::swap(TrueC, FalseC);
43321     }
43322 
43323     // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
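    // e.g. select Cond, i32 5, i32 2 --> (zext(Cond) * 3) + 2 and
    //      select Cond, i32 9, i32 1 --> (zext(Cond) * 8) + 1, both of which
    //      lower to shifts/LEA rather than a CMOV.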
43324     SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
43325 
43326     // Multiply condition by the difference if non-one.
43327     if (!AbsDiff.isOne())
43328       R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
43329 
43330     // Add the base if non-zero.
43331     if (!FalseC->isZero())
43332       R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
43333 
43334     return R;
43335   }
43336 
43337   return SDValue();
43338 }
43339 
43340 /// If this is a *dynamic* select (non-constant condition) and we can match
43341 /// this node with one of the variable blend instructions, restructure the
43342 /// condition so that blends can use the high (sign) bit of each element.
43343 /// This function will also call SimplifyDemandedBits on already created
43344 /// BLENDV to perform additional simplifications.
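/// (The variable blend instructions (BLENDVPS/BLENDVPD/PBLENDVB) select each
/// destination element based solely on the most significant bit of the
/// corresponding condition element.)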
43345 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
43346                                            TargetLowering::DAGCombinerInfo &DCI,
43347                                            const X86Subtarget &Subtarget) {
43348   SDValue Cond = N->getOperand(0);
43349   if ((N->getOpcode() != ISD::VSELECT &&
43350        N->getOpcode() != X86ISD::BLENDV) ||
43351       ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
43352     return SDValue();
43353 
43354   // Don't optimize before the condition has been transformed to a legal type
43355   // and don't ever optimize vector selects that map to AVX512 mask-registers.
43356   unsigned BitWidth = Cond.getScalarValueSizeInBits();
43357   if (BitWidth < 8 || BitWidth > 64)
43358     return SDValue();
43359 
43360   // We can only handle the cases where VSELECT is directly legal on the
43361   // subtarget. We custom lower VSELECT nodes with constant conditions and
43362   // this makes it hard to see whether a dynamic VSELECT will correctly
43363   // lower, so we both check the operation's status and explicitly handle the
43364   // cases where a *dynamic* blend will fail even though a constant-condition
43365   // blend could be custom lowered.
43366   // FIXME: We should find a better way to handle this class of problems.
43367   // Potentially, we should combine constant-condition vselect nodes
43368   // pre-legalization into shuffles and not mark as many types as custom
43369   // lowered.
43370   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43371   EVT VT = N->getValueType(0);
43372   if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
43373     return SDValue();
43374   // FIXME: We don't support i16-element blends currently. We could and
43375   // should support them by making *all* the bits in the condition be set
43376   // rather than just the high bit and using an i8-element blend.
43377   if (VT.getVectorElementType() == MVT::i16)
43378     return SDValue();
43379   // Dynamic blending was only available from SSE4.1 onward.
43380   if (VT.is128BitVector() && !Subtarget.hasSSE41())
43381     return SDValue();
43382   // Byte blends are only available in AVX2
43383   if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
43384     return SDValue();
43385   // There are no 512-bit blend instructions that use sign bits.
43386   if (VT.is512BitVector())
43387     return SDValue();
43388 
43389   auto OnlyUsedAsSelectCond = [](SDValue Cond) {
43390     for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
43391          UI != UE; ++UI)
43392       if ((UI->getOpcode() != ISD::VSELECT &&
43393            UI->getOpcode() != X86ISD::BLENDV) ||
43394           UI.getOperandNo() != 0)
43395         return false;
43396 
43397     return true;
43398   };
43399 
43400   APInt DemandedBits(APInt::getSignMask(BitWidth));
43401 
43402   if (OnlyUsedAsSelectCond(Cond)) {
43403     KnownBits Known;
43404     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
43405                                           !DCI.isBeforeLegalizeOps());
43406     if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
43407       return SDValue();
43408 
43409     // If we changed the computation somewhere in the DAG, this change will
43410     // affect all users of Cond. Update all the nodes so that we do not use
43411     // the generic VSELECT anymore. Otherwise, we may perform wrong
43412     // optimizations as we messed with the actual expectation for the vector
43413     // boolean values.
43414     for (SDNode *U : Cond->uses()) {
43415       if (U->getOpcode() == X86ISD::BLENDV)
43416         continue;
43417 
43418       SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
43419                                Cond, U->getOperand(1), U->getOperand(2));
43420       DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
43421       DCI.AddToWorklist(U);
43422     }
43423     DCI.CommitTargetLoweringOpt(TLO);
43424     return SDValue(N, 0);
43425   }
43426 
43427   // Otherwise we can still at least try to simplify multiple use bits.
43428   if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
43429       return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
43430                          N->getOperand(1), N->getOperand(2));
43431 
43432   return SDValue();
43433 }
43434 
43435 // Try to match:
43436 //   (or (and (M, (sub 0, X)), (pandn M, X)))
43437 // which is a special case of:
43438 //   (select M, (sub 0, X), X)
43439 // Per:
43440 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
43441 // We know that, if fNegate is 0 or 1:
43442 //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
43443 //
// Here, we have a mask, M (all 1s or all 0s), and, similarly, we know that:
43445 //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
43446 //   ( M      ? -X : X) == ((X ^   M     ) + (M & 1))
43447 // This lets us transform our vselect to:
43448 //   (add (xor X, M), (and M, 1))
43449 // And further to:
43450 //   (sub (xor X, M), M)
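// Sanity check: with M == all-ones, (X ^ M) - M == ~X + 1 == -X; with M == 0,
// (X ^ 0) - 0 == X.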
43451 static SDValue combineLogicBlendIntoConditionalNegate(
43452     EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
43453     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
43454   EVT MaskVT = Mask.getValueType();
43455   assert(MaskVT.isInteger() &&
43456          DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
43457          "Mask must be zero/all-bits");
43458 
43459   if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
43460     return SDValue();
43461   if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
43462     return SDValue();
43463 
43464   auto IsNegV = [](SDNode *N, SDValue V) {
43465     return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
43466            ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
43467   };
43468 
43469   SDValue V;
43470   if (IsNegV(Y.getNode(), X))
43471     V = X;
43472   else if (IsNegV(X.getNode(), Y))
43473     V = Y;
43474   else
43475     return SDValue();
43476 
43477   SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
43478   SDValue SubOp2 = Mask;
43479 
43480   // If the negate was on the false side of the select, then
43481   // the operands of the SUB need to be swapped. PR 27251.
43482   // This is because the pattern being matched above is
  // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
  // but if the pattern matched was
  // (vselect M, X, (sub 0, X)), that is really the negation of the pattern
43486   // above, -(vselect M, (sub 0, X), X), and therefore the replacement
43487   // pattern also needs to be a negation of the replacement pattern above.
43488   // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
43489   // sub accomplishes the negation of the replacement pattern.
43490   if (V == Y)
43491     std::swap(SubOp1, SubOp2);
43492 
43493   SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
43494   return DAG.getBitcast(VT, Res);
43495 }
43496 
43497 /// Do target-specific dag combines on SELECT and VSELECT nodes.
43498 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
43499                              TargetLowering::DAGCombinerInfo &DCI,
43500                              const X86Subtarget &Subtarget) {
43501   SDLoc DL(N);
43502   SDValue Cond = N->getOperand(0);
43503   SDValue LHS = N->getOperand(1);
43504   SDValue RHS = N->getOperand(2);
43505 
43506   // Try simplification again because we use this function to optimize
43507   // BLENDV nodes that are not handled by the generic combiner.
43508   if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
43509     return V;
43510 
43511   EVT VT = LHS.getValueType();
43512   EVT CondVT = Cond.getValueType();
43513   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43514   bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
43515 
43516   // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
43517   // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
43518   // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
43519   if (CondVT.isVector() && CondVT.isInteger() &&
43520       CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
43521       (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
43522       DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
43523     if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
43524                                                            DL, DAG, Subtarget))
43525       return V;
43526 
43527   // Convert vselects with constant condition into shuffles.
43528   if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
43529     SmallVector<int, 64> Mask;
43530     if (createShuffleMaskFromVSELECT(Mask, Cond))
43531       return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
43532   }
43533 
43534   // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
43535   // by forcing the unselected elements to zero.
43536   // TODO: Can we handle more shuffles with this?
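  // (PSHUFB writes zero to a destination byte whenever bit 7 of the
  // corresponding control byte is set, which is why unselected lanes are
  // forced to 0x80 below.)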
43537   if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
43538       LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
43539       LHS.hasOneUse() && RHS.hasOneUse()) {
43540     MVT SimpleVT = VT.getSimpleVT();
43541     SmallVector<SDValue, 1> LHSOps, RHSOps;
43542     SmallVector<int, 64> LHSMask, RHSMask, CondMask;
43543     if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
43544         getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
43545         getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
43546       int NumElts = VT.getVectorNumElements();
43547       for (int i = 0; i != NumElts; ++i) {
43548         // getConstVector sets negative shuffle mask values as undef, so ensure
43549         // we hardcode SM_SentinelZero values to zero (0x80).
43550         if (CondMask[i] < NumElts) {
43551           LHSMask[i] = (LHSMask[i] == SM_SentinelZero) ? 0x80 : LHSMask[i];
43552           RHSMask[i] = 0x80;
43553         } else {
43554           LHSMask[i] = 0x80;
43555           RHSMask[i] = (RHSMask[i] == SM_SentinelZero) ? 0x80 : RHSMask[i];
43556         }
43557       }
43558       LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
43559                         getConstVector(LHSMask, SimpleVT, DAG, DL, true));
43560       RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
43561                         getConstVector(RHSMask, SimpleVT, DAG, DL, true));
43562       return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
43563     }
43564   }
43565 
43566   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
43567   // instructions match the semantics of the common C idiom x<y?x:y but not
43568   // x<=y?x:y, because of how they handle negative zero (which can be
43569   // ignored in unsafe-math mode).
43570   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
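  // (MINPS/MAXPS return the second source operand when the operands are
  // unordered or when comparing +0.0 with -0.0; MINPS(a,b) is exactly
  // 'a < b ? a : b', which is why operand order matters in the cases below.)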
43571   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
43572       VT != MVT::f80 && VT != MVT::f128 &&
43573       (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
43574       (Subtarget.hasSSE2() ||
43575        (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
43576     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
43577 
43578     unsigned Opcode = 0;
43579     // Check for x CC y ? x : y.
43580     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
43581         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
43582       switch (CC) {
43583       default: break;
43584       case ISD::SETULT:
43585         // Converting this to a min would handle NaNs incorrectly, and swapping
43586         // the operands would cause it to handle comparisons between positive
43587         // and negative zero incorrectly.
43588         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
43589           if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
43590               !(DAG.isKnownNeverZeroFloat(LHS) ||
43591                 DAG.isKnownNeverZeroFloat(RHS)))
43592             break;
43593           std::swap(LHS, RHS);
43594         }
43595         Opcode = X86ISD::FMIN;
43596         break;
43597       case ISD::SETOLE:
43598         // Converting this to a min would handle comparisons between positive
43599         // and negative zero incorrectly.
43600         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
43601             !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
43602           break;
43603         Opcode = X86ISD::FMIN;
43604         break;
43605       case ISD::SETULE:
43606         // Converting this to a min would handle both negative zeros and NaNs
43607         // incorrectly, but we can swap the operands to fix both.
43608         std::swap(LHS, RHS);
43609         LLVM_FALLTHROUGH;
43610       case ISD::SETOLT:
43611       case ISD::SETLT:
43612       case ISD::SETLE:
43613         Opcode = X86ISD::FMIN;
43614         break;
43615 
43616       case ISD::SETOGE:
43617         // Converting this to a max would handle comparisons between positive
43618         // and negative zero incorrectly.
43619         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
43620             !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
43621           break;
43622         Opcode = X86ISD::FMAX;
43623         break;
43624       case ISD::SETUGT:
43625         // Converting this to a max would handle NaNs incorrectly, and swapping
43626         // the operands would cause it to handle comparisons between positive
43627         // and negative zero incorrectly.
43628         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
43629           if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
43630               !(DAG.isKnownNeverZeroFloat(LHS) ||
43631                 DAG.isKnownNeverZeroFloat(RHS)))
43632             break;
43633           std::swap(LHS, RHS);
43634         }
43635         Opcode = X86ISD::FMAX;
43636         break;
43637       case ISD::SETUGE:
43638         // Converting this to a max would handle both negative zeros and NaNs
43639         // incorrectly, but we can swap the operands to fix both.
43640         std::swap(LHS, RHS);
43641         LLVM_FALLTHROUGH;
43642       case ISD::SETOGT:
43643       case ISD::SETGT:
43644       case ISD::SETGE:
43645         Opcode = X86ISD::FMAX;
43646         break;
43647       }
43648     // Check for x CC y ? y : x -- a min/max with reversed arms.
43649     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
43650                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
43651       switch (CC) {
43652       default: break;
43653       case ISD::SETOGE:
43654         // Converting this to a min would handle comparisons between positive
43655         // and negative zero incorrectly, and swapping the operands would
43656         // cause it to handle NaNs incorrectly.
43657         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
43658             !(DAG.isKnownNeverZeroFloat(LHS) ||
43659               DAG.isKnownNeverZeroFloat(RHS))) {
43660           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
43661             break;
43662           std::swap(LHS, RHS);
43663         }
43664         Opcode = X86ISD::FMIN;
43665         break;
43666       case ISD::SETUGT:
43667         // Converting this to a min would handle NaNs incorrectly.
43668         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
43669           break;
43670         Opcode = X86ISD::FMIN;
43671         break;
43672       case ISD::SETUGE:
43673         // Converting this to a min would handle both negative zeros and NaNs
43674         // incorrectly, but we can swap the operands to fix both.
43675         std::swap(LHS, RHS);
43676         LLVM_FALLTHROUGH;
43677       case ISD::SETOGT:
43678       case ISD::SETGT:
43679       case ISD::SETGE:
43680         Opcode = X86ISD::FMIN;
43681         break;
43682 
43683       case ISD::SETULT:
43684         // Converting this to a max would handle NaNs incorrectly.
43685         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
43686           break;
43687         Opcode = X86ISD::FMAX;
43688         break;
43689       case ISD::SETOLE:
43690         // Converting this to a max would handle comparisons between positive
43691         // and negative zero incorrectly, and swapping the operands would
43692         // cause it to handle NaNs incorrectly.
43693         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
43694             !DAG.isKnownNeverZeroFloat(LHS) &&
43695             !DAG.isKnownNeverZeroFloat(RHS)) {
43696           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
43697             break;
43698           std::swap(LHS, RHS);
43699         }
43700         Opcode = X86ISD::FMAX;
43701         break;
43702       case ISD::SETULE:
43703         // Converting this to a max would handle both negative zeros and NaNs
43704         // incorrectly, but we can swap the operands to fix both.
43705         std::swap(LHS, RHS);
43706         LLVM_FALLTHROUGH;
43707       case ISD::SETOLT:
43708       case ISD::SETLT:
43709       case ISD::SETLE:
43710         Opcode = X86ISD::FMAX;
43711         break;
43712       }
43713     }
43714 
43715     if (Opcode)
43716       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
43717   }
43718 
43719   // Some mask scalar intrinsics rely on checking if only one bit is set
43720   // and implement it in C code like this:
43721   // A[0] = (U & 1) ? A[0] : W[0];
43722   // This creates some redundant instructions that break pattern matching.
  // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
43724   if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
43725       Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
43726     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
43727     SDValue AndNode = Cond.getOperand(0);
43728     if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
43729         isNullConstant(Cond.getOperand(1)) &&
43730         isOneConstant(AndNode.getOperand(1))) {
43731       // LHS and RHS swapped due to
43732       // setcc outputting 1 when AND resulted in 0 and vice versa.
43733       AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
43734       return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
43735     }
43736   }
43737 
  // v16i8 (select v16i1, v16i8, v16i8) does not have a proper lowering on
  // KNL. In this case we convert it to v16i8 (select v16i8, v16i8, v16i8) and
  // use an AVX instruction. The same applies to all vectors of i8 and i16
  // elements when BWI is not available.
43742   // Make sure we extend these even before type legalization gets a chance to
43743   // split wide vectors.
43744   // Since SKX these selects have a proper lowering.
43745   if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
43746       CondVT.getVectorElementType() == MVT::i1 &&
43747       (VT.getVectorElementType() == MVT::i8 ||
43748        VT.getVectorElementType() == MVT::i16)) {
43749     Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
43750     return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
43751   }
43752 
43753   // AVX512 - Extend select with zero to merge with target shuffle.
43754   // select(mask, extract_subvector(shuffle(x)), zero) -->
43755   // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
43756   // TODO - support non target shuffles as well.
43757   if (Subtarget.hasAVX512() && CondVT.isVector() &&
43758       CondVT.getVectorElementType() == MVT::i1) {
43759     auto SelectableOp = [&TLI](SDValue Op) {
43760       return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43761              isTargetShuffle(Op.getOperand(0).getOpcode()) &&
43762              isNullConstant(Op.getOperand(1)) &&
43763              TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
43764              Op.hasOneUse() && Op.getOperand(0).hasOneUse();
43765     };
43766 
43767     bool SelectableLHS = SelectableOp(LHS);
43768     bool SelectableRHS = SelectableOp(RHS);
43769     bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
43770     bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
43771 
43772     if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
43773       EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
43774                                 : RHS.getOperand(0).getValueType();
43775       EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
43776       LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
43777                             VT.getSizeInBits());
43778       RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
43779                             VT.getSizeInBits());
43780       Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
43781                          DAG.getUNDEF(SrcCondVT), Cond,
43782                          DAG.getIntPtrConstant(0, DL));
43783       SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
43784       return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
43785     }
43786   }
43787 
43788   if (SDValue V = combineSelectOfTwoConstants(N, DAG))
43789     return V;
43790 
43791   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
43792       Cond.hasOneUse()) {
43793     EVT CondVT = Cond.getValueType();
43794     SDValue Cond0 = Cond.getOperand(0);
43795     SDValue Cond1 = Cond.getOperand(1);
43796     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
43797 
43798     // Canonicalize min/max:
43799     // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
43800     // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
43801     // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
43802     // the need for an extra compare against zero. e.g.
    // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
43804     // subl   %esi, %edi
43805     // testl  %edi, %edi
43806     // movl   $0, %eax
43807     // cmovgl %edi, %eax
43808     // =>
43809     // xorl   %eax, %eax
    // subl   %esi, %edi
43811     // cmovsl %eax, %edi
43812     //
43813     // We can also canonicalize
43814     //  (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
43815     //  (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
43816     // This allows the use of a test instruction for the compare.
43817     if (LHS == Cond0 && RHS == Cond1) {
43818       if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
43819           (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
43820         ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
43821         Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
43822         return DAG.getSelect(DL, VT, Cond, LHS, RHS);
43823       }
43824       if (CC == ISD::SETUGT && isOneConstant(RHS)) {
43825         ISD::CondCode NewCC = ISD::SETUGE;
43826         Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
43827         return DAG.getSelect(DL, VT, Cond, LHS, RHS);
43828       }
43829     }
43830 
43831     // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
43832     // fold eq + gt/lt nested selects into ge/le selects
43833     // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
43834     // --> (select (cmpuge Cond0, Cond1), LHS, Y)
43835     // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
43836     // --> (select (cmpsle Cond0, Cond1), LHS, Y)
43837     // .. etc ..
43838     if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
43839         RHS.getOperand(0).getOpcode() == ISD::SETCC) {
43840       SDValue InnerSetCC = RHS.getOperand(0);
43841       ISD::CondCode InnerCC =
43842           cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
43843       if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
43844           Cond0 == InnerSetCC.getOperand(0) &&
43845           Cond1 == InnerSetCC.getOperand(1)) {
43846         ISD::CondCode NewCC;
43847         switch (CC == ISD::SETEQ ? InnerCC : CC) {
43848         case ISD::SETGT:  NewCC = ISD::SETGE; break;
43849         case ISD::SETLT:  NewCC = ISD::SETLE; break;
43850         case ISD::SETUGT: NewCC = ISD::SETUGE; break;
43851         case ISD::SETULT: NewCC = ISD::SETULE; break;
43852         default: NewCC = ISD::SETCC_INVALID; break;
43853         }
43854         if (NewCC != ISD::SETCC_INVALID) {
43855           Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
43856           return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
43857         }
43858       }
43859     }
43860   }
43861 
  // Check if the first operand is all zeros and the Cond type is vXi1.
  // If this is an AVX512 target, we can improve the use of zero masking by
  // swapping the operands and inverting the condition.
43865   if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
43866        Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
43867       ISD::isBuildVectorAllZeros(LHS.getNode()) &&
43868       !ISD::isBuildVectorAllZeros(RHS.getNode())) {
43869     // Invert the cond to not(cond) : xor(op,allones)=not(op)
43870     SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
43871     // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
43872     return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
43873   }
43874 
43875   // Early exit check
43876   if (!TLI.isTypeLegal(VT))
43877     return SDValue();
43878 
43879   if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
43880     return V;
43881 
43882   if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
43883     return V;
43884 
43885   if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
43886     return V;
43887 
43888   // select(~Cond, X, Y) -> select(Cond, Y, X)
43889   if (CondVT.getScalarType() != MVT::i1) {
43890     if (SDValue CondNot = IsNOT(Cond, DAG))
43891       return DAG.getNode(N->getOpcode(), DL, VT,
43892                          DAG.getBitcast(CondVT, CondNot), RHS, LHS);
43893     // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit.
43894     if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
43895         ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
43896       Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
43897                          DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
43898       return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
43899     }
43900   }
43901 
43902   // Try to optimize vXi1 selects if both operands are either all constants or
43903   // bitcasts from scalar integer type. In that case we can convert the operands
43904   // to integer and use an integer select which will be converted to a CMOV.
43905   // We need to take a little bit of care to avoid creating an i64 type after
43906   // type legalization.
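  // e.g. (select i1 C, v8i1 X, v8i1 Y)
  //        --> (bitcast (select i1 C, (bitcast X to i8), (bitcast Y to i8)))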
43907   if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
43908       VT.getVectorElementType() == MVT::i1 &&
43909       (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
43910     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
43911     bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
43912     bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
43913 
43914     if ((LHSIsConst ||
43915          (LHS.getOpcode() == ISD::BITCAST &&
43916           LHS.getOperand(0).getValueType() == IntVT)) &&
43917         (RHSIsConst ||
43918          (RHS.getOpcode() == ISD::BITCAST &&
43919           RHS.getOperand(0).getValueType() == IntVT))) {
43920       if (LHSIsConst)
43921         LHS = combinevXi1ConstantToInteger(LHS, DAG);
43922       else
43923         LHS = LHS.getOperand(0);
43924 
43925       if (RHSIsConst)
43926         RHS = combinevXi1ConstantToInteger(RHS, DAG);
43927       else
43928         RHS = RHS.getOperand(0);
43929 
43930       SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
43931       return DAG.getBitcast(VT, Select);
43932     }
43933   }
43934 
43935   // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
43936   // single bits, then invert the predicate and swap the select operands.
43937   // This can lower using a vector shift bit-hack rather than mask and compare.
43938   if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
43939       N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
43940       Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
43941       Cond.getOperand(0).getOpcode() == ISD::AND &&
43942       isNullOrNullSplat(Cond.getOperand(1)) &&
43943       cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
43944       Cond.getOperand(0).getValueType() == VT) {
43945     // The 'and' mask must be composed of power-of-2 constants.
43946     SDValue And = Cond.getOperand(0);
43947     auto *C = isConstOrConstSplat(And.getOperand(1));
43948     if (C && C->getAPIntValue().isPowerOf2()) {
43949       // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
43950       SDValue NotCond =
43951           DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
43952       return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
43953     }
43954 
43955     // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
43956     // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
43957     // 16-bit lacks a proper blendv.
43958     unsigned EltBitWidth = VT.getScalarSizeInBits();
43959     bool CanShiftBlend =
43960         TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
43961                                 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
43962                                 (Subtarget.hasXOP()));
43963     if (CanShiftBlend &&
43964         ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
43965           return C->getAPIntValue().isPowerOf2();
43966         })) {
43967       // Create a left-shift constant to get the mask bits over to the sign-bit.
43968       SDValue Mask = And.getOperand(1);
43969       SmallVector<int, 32> ShlVals;
43970       for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
43971         auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
43972         ShlVals.push_back(EltBitWidth - 1 -
43973                           MaskVal->getAPIntValue().exactLogBase2());
43974       }
43975       // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
43976       SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
43977       SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
43978       SDValue NewCond =
43979           DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
43980       return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
43981     }
43982   }
43983 
43984   return SDValue();
43985 }
43986 
43987 /// Combine:
43988 ///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
43989 /// to:
43990 ///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
43991 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
43992 /// Note that this is only legal for some op/cc combinations.
43993 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
43994                                        SelectionDAG &DAG,
43995                                        const X86Subtarget &Subtarget) {
43996   // This combine only operates on CMP-like nodes.
43997   if (!(Cmp.getOpcode() == X86ISD::CMP ||
43998         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
43999     return SDValue();
44000 
44001   // Can't replace the cmp if it has more uses than the one we're looking at.
44002   // FIXME: We would like to be able to handle this, but would need to make sure
44003   // all uses were updated.
44004   if (!Cmp.hasOneUse())
44005     return SDValue();
44006 
44007   // This only applies to variations of the common case:
44008   //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
44009   //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
44010   //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
44011   //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
  // With the proper condcodes (see below), overflow is accounted for.
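  // These rewrites hold because the atomic op returns the value from *before*
  // the add/sub, while the LOCKed instruction sets EFLAGS from the value
  // *after* it; the +/-1 shift in the comparisons accounts for that.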
44013 
44014   // FIXME: We can generalize both constraints:
44015   // - XOR/OR/AND (if they were made to survive AtomicExpand)
44016   // - LHS != 1
44017   // if the result is compared.
44018 
44019   SDValue CmpLHS = Cmp.getOperand(0);
44020   SDValue CmpRHS = Cmp.getOperand(1);
44021   EVT CmpVT = CmpLHS.getValueType();
44022 
44023   if (!CmpLHS.hasOneUse())
44024     return SDValue();
44025 
44026   unsigned Opc = CmpLHS.getOpcode();
44027   if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
44028     return SDValue();
44029 
44030   SDValue OpRHS = CmpLHS.getOperand(2);
44031   auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
44032   if (!OpRHSC)
44033     return SDValue();
44034 
44035   APInt Addend = OpRHSC->getAPIntValue();
44036   if (Opc == ISD::ATOMIC_LOAD_SUB)
44037     Addend = -Addend;
44038 
44039   auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
44040   if (!CmpRHSC)
44041     return SDValue();
44042 
44043   APInt Comparison = CmpRHSC->getAPIntValue();
44044   APInt NegAddend = -Addend;
44045 
44046   // See if we can adjust the CC to make the comparison match the negated
44047   // addend.
44048   if (Comparison != NegAddend) {
44049     APInt IncComparison = Comparison + 1;
44050     if (IncComparison == NegAddend) {
44051       if (CC == X86::COND_A && !Comparison.isMaxValue()) {
44052         Comparison = IncComparison;
44053         CC = X86::COND_AE;
44054       } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
44055         Comparison = IncComparison;
44056         CC = X86::COND_L;
44057       }
44058     }
44059     APInt DecComparison = Comparison - 1;
44060     if (DecComparison == NegAddend) {
44061       if (CC == X86::COND_AE && !Comparison.isMinValue()) {
44062         Comparison = DecComparison;
44063         CC = X86::COND_A;
44064       } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
44065         Comparison = DecComparison;
44066         CC = X86::COND_LE;
44067       }
44068     }
44069   }
44070 
44071   // If the addend is the negation of the comparison value, then we can do
44072   // a full comparison by emitting the atomic arithmetic as a locked sub.
44073   if (Comparison == NegAddend) {
44074     // The CC is fine, but we need to rewrite the LHS of the comparison as an
44075     // atomic sub.
44076     auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
44077     auto AtomicSub = DAG.getAtomic(
44078         ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
44079         /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
44080         /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
44081         AN->getMemOperand());
44082     auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
44083     DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
44084     DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
44085     return LockOp;
44086   }
44087 
44088   // We can handle comparisons with zero in a number of cases by manipulating
44089   // the CC used.
44090   if (!Comparison.isZero())
44091     return SDValue();
44092 
44093   if (CC == X86::COND_S && Addend == 1)
44094     CC = X86::COND_LE;
44095   else if (CC == X86::COND_NS && Addend == 1)
44096     CC = X86::COND_G;
44097   else if (CC == X86::COND_G && Addend == -1)
44098     CC = X86::COND_GE;
44099   else if (CC == X86::COND_LE && Addend == -1)
44100     CC = X86::COND_L;
44101   else
44102     return SDValue();
44103 
44104   SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
44105   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
44106   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
44107   return LockOp;
44108 }
44109 
// Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and the proper
// condition code.
44113 //
44114 // Simplify the following patterns:
44115 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
44116 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
44117 // to (Op EFLAGS Cond)
44118 //
44119 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
44120 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
44121 // to (Op EFLAGS !Cond)
44122 //
44123 // where Op could be BRCOND or CMOV.
44124 //
44125 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
44126   // This combine only operates on CMP-like nodes.
44127   if (!(Cmp.getOpcode() == X86ISD::CMP ||
44128         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
44129     return SDValue();
44130 
44131   // Quit if not used as a boolean value.
44132   if (CC != X86::COND_E && CC != X86::COND_NE)
44133     return SDValue();
44134 
  // Check the CMP operands. One of them should be 0 or 1 and the other should
  // be a SetCC node or a value extended from one.
44137   SDValue Op1 = Cmp.getOperand(0);
44138   SDValue Op2 = Cmp.getOperand(1);
44139 
44140   SDValue SetCC;
44141   const ConstantSDNode* C = nullptr;
44142   bool needOppositeCond = (CC == X86::COND_E);
44143   bool checkAgainstTrue = false; // Is it a comparison against 1?
44144 
44145   if ((C = dyn_cast<ConstantSDNode>(Op1)))
44146     SetCC = Op2;
44147   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
44148     SetCC = Op1;
  else // Quit if neither operand is a constant.
44150     return SDValue();
44151 
44152   if (C->getZExtValue() == 1) {
44153     needOppositeCond = !needOppositeCond;
44154     checkAgainstTrue = true;
44155   } else if (C->getZExtValue() != 0)
    // Quit if the constant is neither 0 nor 1.
44157     return SDValue();
44158 
44159   bool truncatedToBoolWithAnd = false;
44160   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
44161   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
44162          SetCC.getOpcode() == ISD::TRUNCATE ||
44163          SetCC.getOpcode() == ISD::AND) {
44164     if (SetCC.getOpcode() == ISD::AND) {
44165       int OpIdx = -1;
44166       if (isOneConstant(SetCC.getOperand(0)))
44167         OpIdx = 1;
44168       if (isOneConstant(SetCC.getOperand(1)))
44169         OpIdx = 0;
44170       if (OpIdx < 0)
44171         break;
44172       SetCC = SetCC.getOperand(OpIdx);
44173       truncatedToBoolWithAnd = true;
44174     } else
44175       SetCC = SetCC.getOperand(0);
44176   }
44177 
44178   switch (SetCC.getOpcode()) {
44179   case X86ISD::SETCC_CARRY:
44180     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
44181     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
44182     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
44183     // truncated to i1 using 'and'.
44184     if (checkAgainstTrue && !truncatedToBoolWithAnd)
44185       break;
44186     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
44187            "Invalid use of SETCC_CARRY!");
44188     LLVM_FALLTHROUGH;
44189   case X86ISD::SETCC:
44190     // Set the condition code or opposite one if necessary.
44191     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
44192     if (needOppositeCond)
44193       CC = X86::GetOppositeBranchCondition(CC);
44194     return SetCC.getOperand(1);
44195   case X86ISD::CMOV: {
    // Check whether the false/true values are canonical, i.e. 0 or 1.
44197     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
44198     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
44199     // Quit if true value is not a constant.
44200     if (!TVal)
44201       return SDValue();
44202     // Quit if false value is not a constant.
44203     if (!FVal) {
44204       SDValue Op = SetCC.getOperand(0);
44205       // Skip 'zext' or 'trunc' node.
44206       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
44207           Op.getOpcode() == ISD::TRUNCATE)
44208         Op = Op.getOperand(0);
      // A special case for rdrand/rdseed, where 0 is returned when the false
      // condition is found.
44211       if ((Op.getOpcode() != X86ISD::RDRAND &&
44212            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
44213         return SDValue();
44214     }
44215     // Quit if false value is not the constant 0 or 1.
44216     bool FValIsFalse = true;
44217     if (FVal && FVal->getZExtValue() != 0) {
44218       if (FVal->getZExtValue() != 1)
44219         return SDValue();
44220       // If FVal is 1, opposite cond is needed.
44221       needOppositeCond = !needOppositeCond;
44222       FValIsFalse = false;
44223     }
44224     // Quit if TVal is not the constant opposite of FVal.
44225     if (FValIsFalse && TVal->getZExtValue() != 1)
44226       return SDValue();
44227     if (!FValIsFalse && TVal->getZExtValue() != 0)
44228       return SDValue();
44229     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
44230     if (needOppositeCond)
44231       CC = X86::GetOppositeBranchCondition(CC);
44232     return SetCC.getOperand(3);
44233   }
44234   }
44235 
44236   return SDValue();
44237 }
44238 
44239 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
44240 /// Match:
44241 ///   (X86or (X86setcc) (X86setcc))
44242 ///   (X86cmp (and (X86setcc) (X86setcc)), 0)
44243 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
44244                                            X86::CondCode &CC1, SDValue &Flags,
44245                                            bool &isAnd) {
44246   if (Cond->getOpcode() == X86ISD::CMP) {
44247     if (!isNullConstant(Cond->getOperand(1)))
44248       return false;
44249 
44250     Cond = Cond->getOperand(0);
44251   }
44252 
44253   isAnd = false;
44254 
44255   SDValue SetCC0, SetCC1;
44256   switch (Cond->getOpcode()) {
44257   default: return false;
44258   case ISD::AND:
44259   case X86ISD::AND:
44260     isAnd = true;
44261     LLVM_FALLTHROUGH;
44262   case ISD::OR:
44263   case X86ISD::OR:
44264     SetCC0 = Cond->getOperand(0);
44265     SetCC1 = Cond->getOperand(1);
44266     break;
44267   };
44268 
44269   // Make sure we have SETCC nodes, using the same flags value.
44270   if (SetCC0.getOpcode() != X86ISD::SETCC ||
44271       SetCC1.getOpcode() != X86ISD::SETCC ||
44272       SetCC0->getOperand(1) != SetCC1->getOperand(1))
44273     return false;
44274 
44275   CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
44276   CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
44277   Flags = SetCC0->getOperand(1);
44278   return true;
44279 }
44280 
// When legalizing a carry, we create carries via 'add X, -1'.
// If that value comes from an actual carry, via a setcc, we can use the
// carry directly.
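// For example (illustrative): (X86add (zext (setcc COND_B, Flags)), -1) sets
// CF exactly when the setcc produced 1, so the original Flags can feed the
// user of the carry directly.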
44284 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
44285   if (EFLAGS.getOpcode() == X86ISD::ADD) {
44286     if (isAllOnesConstant(EFLAGS.getOperand(1))) {
44287       SDValue Carry = EFLAGS.getOperand(0);
44288       while (Carry.getOpcode() == ISD::TRUNCATE ||
44289              Carry.getOpcode() == ISD::ZERO_EXTEND ||
44290              Carry.getOpcode() == ISD::SIGN_EXTEND ||
44291              Carry.getOpcode() == ISD::ANY_EXTEND ||
44292              (Carry.getOpcode() == ISD::AND &&
44293               isOneConstant(Carry.getOperand(1))))
44294         Carry = Carry.getOperand(0);
44295       if (Carry.getOpcode() == X86ISD::SETCC ||
44296           Carry.getOpcode() == X86ISD::SETCC_CARRY) {
44297         // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
44298         uint64_t CarryCC = Carry.getConstantOperandVal(0);
44299         SDValue CarryOp1 = Carry.getOperand(1);
44300         if (CarryCC == X86::COND_B)
44301           return CarryOp1;
44302         if (CarryCC == X86::COND_A) {
44303           // Try to convert COND_A into COND_B in an attempt to facilitate
44304           // materializing "setb reg".
44305           //
          // Do not flip "e > c", where "c" is a constant, because the CMP
          // instruction cannot take an immediate as its first operand.
44308           //
44309           if (CarryOp1.getOpcode() == X86ISD::SUB &&
44310               CarryOp1.getNode()->hasOneUse() &&
44311               CarryOp1.getValueType().isInteger() &&
44312               !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
44313             SDValue SubCommute =
44314                 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
44315                             CarryOp1.getOperand(1), CarryOp1.getOperand(0));
44316             return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
44317           }
44318         }
44319         // If this is a check of the z flag of an add with 1, switch to the
44320         // C flag.
44321         if (CarryCC == X86::COND_E &&
44322             CarryOp1.getOpcode() == X86ISD::ADD &&
44323             isOneConstant(CarryOp1.getOperand(1)))
44324           return CarryOp1;
44325       }
44326     }
44327   }
44328 
44329   return SDValue();
44330 }
44331 
/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
44333 /// to avoid the inversion.
44334 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
44335                               SelectionDAG &DAG,
44336                               const X86Subtarget &Subtarget) {
44337   // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
44338   if (EFLAGS.getOpcode() != X86ISD::PTEST &&
44339       EFLAGS.getOpcode() != X86ISD::TESTP)
44340     return SDValue();
44341 
44342   // PTEST/TESTP sets EFLAGS as:
44343   // TESTZ: ZF = (Op0 & Op1) == 0
44344   // TESTC: CF = (~Op0 & Op1) == 0
44345   // TESTNZC: ZF == 0 && CF == 0
44346   EVT VT = EFLAGS.getValueType();
44347   SDValue Op0 = EFLAGS.getOperand(0);
44348   SDValue Op1 = EFLAGS.getOperand(1);
44349   EVT OpVT = Op0.getValueType();
44350 
  // TEST*(~X,Y) == TEST*(X,Y) with the Z/C predicates swapped, e.g.
  // TESTC(~X,Y) == TESTZ(X,Y); TESTNZC keeps its condition unchanged.
44352   if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
44353     X86::CondCode InvCC;
44354     switch (CC) {
44355     case X86::COND_B:
44356       // testc -> testz.
44357       InvCC = X86::COND_E;
44358       break;
44359     case X86::COND_AE:
44360       // !testc -> !testz.
44361       InvCC = X86::COND_NE;
44362       break;
44363     case X86::COND_E:
44364       // testz -> testc.
44365       InvCC = X86::COND_B;
44366       break;
44367     case X86::COND_NE:
44368       // !testz -> !testc.
44369       InvCC = X86::COND_AE;
44370       break;
44371     case X86::COND_A:
44372     case X86::COND_BE:
44373       // testnzc -> testnzc (no change).
44374       InvCC = CC;
44375       break;
44376     default:
44377       InvCC = X86::COND_INVALID;
44378       break;
44379     }
44380 
44381     if (InvCC != X86::COND_INVALID) {
44382       CC = InvCC;
44383       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44384                          DAG.getBitcast(OpVT, NotOp0), Op1);
44385     }
44386   }
44387 
44388   if (CC == X86::COND_E || CC == X86::COND_NE) {
44389     // TESTZ(X,~Y) == TESTC(Y,X)
44390     if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
44391       CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
44392       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44393                          DAG.getBitcast(OpVT, NotOp1), Op0);
44394     }
44395 
44396     if (Op0 == Op1) {
44397       SDValue BC = peekThroughBitcasts(Op0);
44398       EVT BCVT = BC.getValueType();
44399       assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
44400              "Unexpected vector type");
44401 
44402       // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
44403       if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
44404         return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44405                            DAG.getBitcast(OpVT, BC.getOperand(0)),
44406                            DAG.getBitcast(OpVT, BC.getOperand(1)));
44407       }
44408 
44409       // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
44410       if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
44411         CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
44412         return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44413                            DAG.getBitcast(OpVT, BC.getOperand(0)),
44414                            DAG.getBitcast(OpVT, BC.getOperand(1)));
44415       }
44416 
44417       // If every element is an all-sign value, see if we can use MOVMSK to
44418       // more efficiently extract the sign bits and compare that.
44419       // TODO: Handle TESTC with comparison inversion.
44420       // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
      // MOVMSK combines to make sure it's never worse than PTEST?
44422       unsigned EltBits = BCVT.getScalarSizeInBits();
44423       if (DAG.ComputeNumSignBits(BC) == EltBits) {
44424         assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
44425         APInt SignMask = APInt::getSignMask(EltBits);
44426         const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44427         if (SDValue Res =
44428                 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
          // For vXi16 cases we need to use pmovmskb and extract every other
44430           // sign bit.
44431           SDLoc DL(EFLAGS);
44432           if (EltBits == 16) {
44433             MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
44434             Res = DAG.getBitcast(MovmskVT, Res);
44435             Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
44436             Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
44437                               DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
44438           } else {
44439             Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
44440           }
44441           return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
44442                              DAG.getConstant(0, DL, MVT::i32));
44443         }
44444       }
44445     }
44446 
44447     // TESTZ(-1,X) == TESTZ(X,X)
44448     if (ISD::isBuildVectorAllOnes(Op0.getNode()))
44449       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
44450 
44451     // TESTZ(X,-1) == TESTZ(X,X)
44452     if (ISD::isBuildVectorAllOnes(Op1.getNode()))
44453       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
44454 
44455     // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
44456     // TODO: Add COND_NE handling?
44457     if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
44458       SDValue Src0 = peekThroughBitcasts(Op0);
44459       SDValue Src1 = peekThroughBitcasts(Op1);
44460       if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
44461         Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
44462                                  peekThroughBitcasts(Src0.getOperand(1)), true);
44463         Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
44464                                  peekThroughBitcasts(Src1.getOperand(1)), true);
44465         if (Src0 && Src1)
44466           return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
44467                              DAG.getBitcast(MVT::v4i64, Src0),
44468                              DAG.getBitcast(MVT::v4i64, Src1));
44469       }
44470     }
44471   }
44472 
44473   return SDValue();
44474 }
44475 
44476 // Attempt to simplify the MOVMSK input based on the comparison type.
44477 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
44478                                   SelectionDAG &DAG,
44479                                   const X86Subtarget &Subtarget) {
44480   // Handle eq/ne against zero (any_of).
44481   // Handle eq/ne against -1 (all_of).
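  // i.e. MOVMSK(X) ==/!= 0 tests whether any element's sign bit is set, while
  // MOVMSK(X) ==/!= an all-ones mask tests whether every element's sign bit is
  // set.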
44482   if (!(CC == X86::COND_E || CC == X86::COND_NE))
44483     return SDValue();
44484   if (EFLAGS.getValueType() != MVT::i32)
44485     return SDValue();
44486   unsigned CmpOpcode = EFLAGS.getOpcode();
44487   if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
44488     return SDValue();
44489   auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
44490   if (!CmpConstant)
44491     return SDValue();
44492   const APInt &CmpVal = CmpConstant->getAPIntValue();
44493 
44494   SDValue CmpOp = EFLAGS.getOperand(0);
44495   unsigned CmpBits = CmpOp.getValueSizeInBits();
44496   assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
44497 
44498   // Peek through any truncate.
44499   if (CmpOp.getOpcode() == ISD::TRUNCATE)
44500     CmpOp = CmpOp.getOperand(0);
44501 
44502   // Bail if we don't find a MOVMSK.
44503   if (CmpOp.getOpcode() != X86ISD::MOVMSK)
44504     return SDValue();
44505 
44506   SDValue Vec = CmpOp.getOperand(0);
44507   MVT VecVT = Vec.getSimpleValueType();
44508   assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
44509          "Unexpected MOVMSK operand");
44510   unsigned NumElts = VecVT.getVectorNumElements();
44511   unsigned NumEltBits = VecVT.getScalarSizeInBits();
44512 
44513   bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
44514   bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
44515                  NumElts <= CmpBits && CmpVal.isMask(NumElts);
44516   if (!IsAnyOf && !IsAllOf)
44517     return SDValue();
44518 
44519   // See if we can peek through to a vector with a wider element type, if the
44520   // signbits extend down to all the sub-elements as well.
44521   // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
44522   // potential SimplifyDemandedBits/Elts cases.
  // If we looked through a truncate that discards bits, we can't do this
44524   // transform.
44525   // FIXME: We could do this transform for truncates that discarded bits by
44526   // inserting an AND mask between the new MOVMSK and the CMP.
44527   if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
44528     SDValue BC = peekThroughBitcasts(Vec);
44529     MVT BCVT = BC.getSimpleValueType();
44530     unsigned BCNumElts = BCVT.getVectorNumElements();
44531     unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
44532     if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
44533         BCNumEltBits > NumEltBits &&
44534         DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
44535       SDLoc DL(EFLAGS);
44536       APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
44537       return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
44538                          DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
44539                          DAG.getConstant(CmpMask, DL, MVT::i32));
44540     }
44541   }
44542 
44543   // MOVMSK(CONCAT(X,Y)) == 0 ->  MOVMSK(OR(X,Y)).
44544   // MOVMSK(CONCAT(X,Y)) != 0 ->  MOVMSK(OR(X,Y)).
44545   // MOVMSK(CONCAT(X,Y)) == -1 ->  MOVMSK(AND(X,Y)).
44546   // MOVMSK(CONCAT(X,Y)) != -1 ->  MOVMSK(AND(X,Y)).
44547   if (VecVT.is256BitVector() && NumElts <= CmpBits) {
44548     SmallVector<SDValue> Ops;
44549     if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops) &&
44550         Ops.size() == 2) {
44551       SDLoc DL(EFLAGS);
44552       EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
44553       APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
44554       SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
44555                               DAG.getBitcast(SubVT, Ops[0]),
44556                               DAG.getBitcast(SubVT, Ops[1]));
44557       V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
44558       return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
44559                          DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
44560                          DAG.getConstant(CmpMask, DL, MVT::i32));
44561     }
44562   }
44563 
44564   // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
44565   // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
44566   // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(SUB(X,Y),SUB(X,Y)).
44567   // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(SUB(X,Y),SUB(X,Y)).
44568   if (IsAllOf && Subtarget.hasSSE41()) {
44569     MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
44570     SDValue BC = peekThroughBitcasts(Vec);
44571     // Ensure MOVMSK was testing every signbit of BC.
44572     if (BC.getValueType().getVectorNumElements() <= NumElts) {
44573       if (BC.getOpcode() == X86ISD::PCMPEQ) {
44574         SDValue V = DAG.getNode(ISD::SUB, SDLoc(BC), BC.getValueType(),
44575                                 BC.getOperand(0), BC.getOperand(1));
44576         V = DAG.getBitcast(TestVT, V);
44577         return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
44578       }
44579       // Check for 256-bit split vector cases.
44580       if (BC.getOpcode() == ISD::AND &&
44581           BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
44582           BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
44583         SDValue LHS = BC.getOperand(0);
44584         SDValue RHS = BC.getOperand(1);
44585         LHS = DAG.getNode(ISD::SUB, SDLoc(LHS), LHS.getValueType(),
44586                           LHS.getOperand(0), LHS.getOperand(1));
44587         RHS = DAG.getNode(ISD::SUB, SDLoc(RHS), RHS.getValueType(),
44588                           RHS.getOperand(0), RHS.getOperand(1));
44589         LHS = DAG.getBitcast(TestVT, LHS);
44590         RHS = DAG.getBitcast(TestVT, RHS);
44591         SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
44592         return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
44593       }
44594     }
44595   }
44596 
44597   // See if we can avoid a PACKSS by calling MOVMSK on the sources.
  // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out the low
  // bytes' sign bits prior to the comparison with zero unless we know that
  // the vXi16 splats the sign bit down to the lower i8 half.
44601   // TODO: Handle all_of patterns.
44602   if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
44603     SDValue VecOp0 = Vec.getOperand(0);
44604     SDValue VecOp1 = Vec.getOperand(1);
44605     bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
44606     bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
    // PMOVMSKB(PACKSSWB(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
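    // (On little-endian, each i16 sign bit lands in the odd byte of the v16i8
    // bitcast, which is why the 0xAAAA mask keeps every other PMOVMSKB bit.)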
44608     if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
44609       SDLoc DL(EFLAGS);
44610       SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
44611       Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
44612       Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
44613       if (!SignExt0) {
44614         Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
44615                              DAG.getConstant(0xAAAA, DL, MVT::i16));
44616       }
44617       return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
44618                          DAG.getConstant(0, DL, MVT::i16));
44619     }
    // PMOVMSKB(PACKSSWB(LO(X), HI(X)))
44621     // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
44622     if (CmpBits >= 16 && Subtarget.hasInt256() &&
44623         (IsAnyOf || (SignExt0 && SignExt1))) {
44624       if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
44625         SDLoc DL(EFLAGS);
44626         SDValue Result = peekThroughBitcasts(Src);
44627         if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
44628             Result.getValueType().getVectorNumElements() <= NumElts) {
44629           SDValue V = DAG.getNode(ISD::SUB, DL, Result.getValueType(),
44630                                   Result.getOperand(0), Result.getOperand(1));
44631           V = DAG.getBitcast(MVT::v4i64, V);
44632           return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
44633         }
44634         Result = DAG.getBitcast(MVT::v32i8, Result);
44635         Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
44636         unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
44637         if (!SignExt0 || !SignExt1) {
44638           assert(IsAnyOf &&
44639                  "Only perform v16i16 signmasks for any_of patterns");
44640           Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
44641                                DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
44642         }
44643         return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
44644                            DAG.getConstant(CmpMask, DL, MVT::i32));
44645       }
44646     }
44647   }
44648 
44649   // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
44650   SmallVector<int, 32> ShuffleMask;
44651   SmallVector<SDValue, 2> ShuffleInputs;
44652   if (NumElts <= CmpBits &&
44653       getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
44654                              ShuffleMask, DAG) &&
44655       ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
44656       ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
44657     unsigned NumShuffleElts = ShuffleMask.size();
44658     APInt DemandedElts = APInt::getZero(NumShuffleElts);
44659     for (int M : ShuffleMask) {
44660       assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
44661       DemandedElts.setBit(M);
44662     }
44663     if (DemandedElts.isAllOnes()) {
44664       SDLoc DL(EFLAGS);
44665       SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
44666       Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
44667       Result =
44668           DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
44669       return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
44670                          EFLAGS.getOperand(1));
44671     }
44672   }
44673 
44674   return SDValue();
44675 }
44676 
44677 /// Optimize an EFLAGS definition used according to the condition code \p CC
44678 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
44679 /// uses of chain values.
44680 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
44681                                   SelectionDAG &DAG,
44682                                   const X86Subtarget &Subtarget) {
44683   if (CC == X86::COND_B)
44684     if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
44685       return Flags;
44686 
44687   if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
44688     return R;
44689 
44690   if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
44691     return R;
44692 
44693   if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
44694     return R;
44695 
44696   return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
44697 }
44698 
44699 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
44700 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
44701                            TargetLowering::DAGCombinerInfo &DCI,
44702                            const X86Subtarget &Subtarget) {
44703   SDLoc DL(N);
44704 
44705   SDValue FalseOp = N->getOperand(0);
44706   SDValue TrueOp = N->getOperand(1);
44707   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
44708   SDValue Cond = N->getOperand(3);
44709 
44710   // cmov X, X, ?, ? --> X
44711   if (TrueOp == FalseOp)
44712     return TrueOp;
44713 
44714   // Try to simplify the EFLAGS and condition code operands.
44715   // We can't always do this as FCMOV only supports a subset of X86 cond.
44716   if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
44717     if (!(FalseOp.getValueType() == MVT::f80 ||
44718           (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
44719           (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
44720         !Subtarget.hasCMov() || hasFPCMov(CC)) {
44721       SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
44722                        Flags};
44723       return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
44724     }
44725   }
44726 
44727   // If this is a select between two integer constants, try to do some
44728   // optimizations.  Note that the operands are ordered the opposite of SELECT
44729   // operands.
44730   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
44731     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
44732       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
44733       // larger than FalseC (the false value).
44734       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
44735         CC = X86::GetOppositeBranchCondition(CC);
44736         std::swap(TrueC, FalseC);
44737         std::swap(TrueOp, FalseOp);
44738       }
44739 
44740       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
44741       // This is efficient for any integer data type (including i8/i16) and
44742       // shift amount.
44743       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
44744         Cond = getSETCC(CC, Cond, DL, DAG);
44745 
44746         // Zero extend the condition if needed.
44747         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
44748 
44749         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
44750         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
44751                            DAG.getConstant(ShAmt, DL, MVT::i8));
44752         return Cond;
44753       }
44754 
      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.  This is efficient
44756       // for any integer data type, including i8/i16.
44757       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
44758         Cond = getSETCC(CC, Cond, DL, DAG);
44759 
44760         // Zero extend the condition if needed.
44761         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
44762                            FalseC->getValueType(0), Cond);
44763         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
44764                            SDValue(FalseC, 0));
44765         return Cond;
44766       }
44767 
44768       // Optimize cases that will turn into an LEA instruction.  This requires
44769       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
44770       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
44771         APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
44772         assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
44773                "Implicit constant truncation");
44774 
44775         bool isFastMultiplier = false;
44776         if (Diff.ult(10)) {
44777           switch (Diff.getZExtValue()) {
44778           default: break;
44779           case 1:  // result = add base, cond
44780           case 2:  // result = lea base(    , cond*2)
44781           case 3:  // result = lea base(cond, cond*2)
44782           case 4:  // result = lea base(    , cond*4)
44783           case 5:  // result = lea base(cond, cond*4)
44784           case 8:  // result = lea base(    , cond*8)
44785           case 9:  // result = lea base(cond, cond*8)
44786             isFastMultiplier = true;
44787             break;
44788           }
44789         }
44790 
44791         if (isFastMultiplier) {
44792           Cond = getSETCC(CC, Cond, DL ,DAG);
44793           // Zero extend the condition if needed.
44794           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
44795                              Cond);
44796           // Scale the condition by the difference.
44797           if (Diff != 1)
44798             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
44799                                DAG.getConstant(Diff, DL, Cond.getValueType()));
44800 
44801           // Add the base if non-zero.
44802           if (FalseC->getAPIntValue() != 0)
44803             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
44804                                SDValue(FalseC, 0));
44805           return Cond;
44806         }
44807       }
44808     }
44809   }
44810 
44811   // Handle these cases:
  //   (select (x != c), e, c) -> (select (x != c), e, x),
  //   (select (x == c), c, e) -> (select (x == c), x, e)
  // where c is an integer constant, and the "select" is the combination
  // of CMOV and CMP.
44816   //
  // The rationale for this change is that a conditional-move from a constant
  // needs two instructions, whereas a conditional-move from a register needs
  // only one instruction.
44820   //
44821   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
44822   //  some instruction-combining opportunities. This opt needs to be
44823   //  postponed as late as possible.
44824   //
44825   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
44826     // the DCI.xxxx conditions are provided to postpone the optimization as
44827     // late as possible.
44828 
44829     ConstantSDNode *CmpAgainst = nullptr;
44830     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
44831         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
44832         !isa<ConstantSDNode>(Cond.getOperand(0))) {
44833 
44834       if (CC == X86::COND_NE &&
44835           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
44836         CC = X86::GetOppositeBranchCondition(CC);
44837         std::swap(TrueOp, FalseOp);
44838       }
44839 
44840       if (CC == X86::COND_E &&
44841           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
44842         SDValue Ops[] = {FalseOp, Cond.getOperand(0),
44843                          DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
44844         return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
44845       }
44846     }
44847   }
44848 
44849   // Fold and/or of setcc's to double CMOV:
44850   //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
44851   //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
44852   //
44853   // This combine lets us generate:
44854   //   cmovcc1 (jcc1 if we don't have CMOV)
44855   //   cmovcc2 (same)
44856   // instead of:
44857   //   setcc1
44858   //   setcc2
44859   //   and/or
44860   //   cmovne (jne if we don't have CMOV)
44861   // When we can't use the CMOV instruction, it might increase branch
44862   // mispredicts.
44863   // When we can use CMOV, or when there is no mispredict, this improves
44864   // throughput and reduces register pressure.
44865   //
44866   if (CC == X86::COND_NE) {
44867     SDValue Flags;
44868     X86::CondCode CC0, CC1;
44869     bool isAndSetCC;
44870     if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
44871       if (isAndSetCC) {
44872         std::swap(FalseOp, TrueOp);
44873         CC0 = X86::GetOppositeBranchCondition(CC0);
44874         CC1 = X86::GetOppositeBranchCondition(CC1);
44875       }
44876 
44877       SDValue LOps[] = {FalseOp, TrueOp,
44878                         DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
44879       SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
44880       SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
44881                        Flags};
44882       SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
44883       return CMOV;
44884     }
44885   }
44886 
44887   // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
44888   //      (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
44889   // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
44890   //    (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
44891   if ((CC == X86::COND_NE || CC == X86::COND_E) &&
44892       Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
44893     SDValue Add = TrueOp;
44894     SDValue Const = FalseOp;
44895     // Canonicalize the condition code for easier matching and output.
44896     if (CC == X86::COND_E)
44897       std::swap(Add, Const);
44898 
44899     // We might have replaced the constant in the cmov with the LHS of the
44900     // compare. If so change it to the RHS of the compare.
44901     if (Const == Cond.getOperand(0))
44902       Const = Cond.getOperand(1);
44903 
44904     // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
44905     if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
44906         Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
44907         (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
44908          Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
44909         Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
44910       EVT VT = N->getValueType(0);
44911       // This should constant fold.
44912       SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
44913       SDValue CMov =
44914           DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
44915                       DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
44916       return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
44917     }
44918   }
44919 
44920   return SDValue();
44921 }
44922 
44923 /// Different mul shrinking modes.
44924 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
44925 
44926 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
44927   EVT VT = N->getOperand(0).getValueType();
44928   if (VT.getScalarSizeInBits() != 32)
44929     return false;
44930 
44931   assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
44932   unsigned SignBits[2] = {1, 1};
44933   bool IsPositive[2] = {false, false};
44934   for (unsigned i = 0; i < 2; i++) {
44935     SDValue Opd = N->getOperand(i);
44936 
44937     SignBits[i] = DAG.ComputeNumSignBits(Opd);
44938     IsPositive[i] = DAG.SignBitIsZero(Opd);
44939   }
44940 
44941   bool AllPositive = IsPositive[0] && IsPositive[1];
44942   unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
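  // Note: an i32 element sign-extended from i8 has at least 25 sign bits
  // (32 - 8 + 1), and one zero-extended from i8 has 24 known-zero top bits
  // with a clear sign bit; the i16 thresholds below follow the same pattern.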
44943   // When ranges are from -128 ~ 127, use MULS8 mode.
44944   if (MinSignBits >= 25)
44945     Mode = ShrinkMode::MULS8;
44946   // When ranges are from 0 ~ 255, use MULU8 mode.
44947   else if (AllPositive && MinSignBits >= 24)
44948     Mode = ShrinkMode::MULU8;
44949   // When ranges are from -32768 ~ 32767, use MULS16 mode.
44950   else if (MinSignBits >= 17)
44951     Mode = ShrinkMode::MULS16;
44952   // When ranges are from 0 ~ 65535, use MULU16 mode.
44953   else if (AllPositive && MinSignBits >= 16)
44954     Mode = ShrinkMode::MULU16;
44955   else
44956     return false;
44957   return true;
44958 }
44959 
44960 /// When the operands of vector mul are extended from smaller size values,
/// like i8 and i16, the type of mul may be shrunk to generate more
44962 /// efficient code. Two typical patterns are handled:
44963 /// Pattern1:
44964 ///     %2 = sext/zext <N x i8> %1 to <N x i32>
44965 ///     %4 = sext/zext <N x i8> %3 to <N x i32>
///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
44967 ///     %5 = mul <N x i32> %2, %4
44968 ///
44969 /// Pattern2:
44970 ///     %2 = zext/sext <N x i16> %1 to <N x i32>
44971 ///     %4 = zext/sext <N x i16> %3 to <N x i32>
44972 ///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
44973 ///     %5 = mul <N x i32> %2, %4
44974 ///
44975 /// There are four mul shrinking modes:
44976 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
44978 /// generate pmullw+sext32 for it (MULS8 mode).
44979 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
44980 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
44981 /// generate pmullw+zext32 for it (MULU8 mode).
44982 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
44983 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
44984 /// generate pmullw+pmulhw for it (MULS16 mode).
44985 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
44986 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
44987 /// generate pmullw+pmulhuw for it (MULU16 mode).
44988 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
44989                                const X86Subtarget &Subtarget) {
44990   // Check for legality
  // pmullw/pmulhw are not available before SSE2.
44992   if (!Subtarget.hasSSE2())
44993     return SDValue();
44994 
44995   // Check for profitability
44996   // pmulld is supported since SSE41. It is better to use pmulld
44997   // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
44998   // the expansion.
44999   bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
45000   if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
45001     return SDValue();
45002 
45003   ShrinkMode Mode;
45004   if (!canReduceVMulWidth(N, DAG, Mode))
45005     return SDValue();
45006 
45007   SDLoc DL(N);
45008   SDValue N0 = N->getOperand(0);
45009   SDValue N1 = N->getOperand(1);
45010   EVT VT = N->getOperand(0).getValueType();
45011   unsigned NumElts = VT.getVectorNumElements();
45012   if ((NumElts % 2) != 0)
45013     return SDValue();
45014 
45015   EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
45016 
45017   // Shrink the operands of mul.
45018   SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
45019   SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
45020 
45021   // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
45022   // lower part is needed.
45023   SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
45024   if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
45025     return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
45026                                                    : ISD::SIGN_EXTEND,
45027                        DL, VT, MulLo);
45028 
45029   EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
45030   // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
45031   // the higher part is also needed.
45032   SDValue MulHi =
45033       DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
45034                   ReducedVT, NewN0, NewN1);
45035 
45036   // Repack the lower part and higher part result of mul into a wider
45037   // result.
45038   // Generate shuffle functioning as punpcklwd.
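  // e.g. for NumElts == 8 the mask below is <0,8,1,9,2,10,3,11>, interleaving
  // the low halves of MulLo and MulHi so each i32 lane holds (lo, hi).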
45039   SmallVector<int, 16> ShuffleMask(NumElts);
45040   for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
45041     ShuffleMask[2 * i] = i;
45042     ShuffleMask[2 * i + 1] = i + NumElts;
45043   }
45044   SDValue ResLo =
45045       DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
45046   ResLo = DAG.getBitcast(ResVT, ResLo);
45047   // Generate shuffle functioning as punpckhwd.
45048   for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
45049     ShuffleMask[2 * i] = i + NumElts / 2;
45050     ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
45051   }
45052   SDValue ResHi =
45053       DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
45054   ResHi = DAG.getBitcast(ResVT, ResHi);
45055   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
45056 }
45057 
45058 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
45059                                  EVT VT, const SDLoc &DL) {
45060 
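  // Helper: build (add/sub (shl (mul_imm x, Mult), Shift), x), i.e.
  // x * ((Mult << Shift) +/- 1).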
45061   auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
45062     SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
45063                                  DAG.getConstant(Mult, DL, VT));
45064     Result = DAG.getNode(ISD::SHL, DL, VT, Result,
45065                          DAG.getConstant(Shift, DL, MVT::i8));
45066     Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
45067                          N->getOperand(0));
45068     return Result;
45069   };
45070 
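  // Helper: build (add/sub (mul_imm (mul_imm x, Mul1), Mul2), x), i.e.
  // x * (Mul1 * Mul2 +/- 1).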
45071   auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
45072     SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
45073                                  DAG.getConstant(Mul1, DL, VT));
45074     Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
45075                          DAG.getConstant(Mul2, DL, VT));
45076     Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
45077                          N->getOperand(0));
45078     return Result;
45079   };
45080 
45081   switch (MulAmt) {
45082   default:
45083     break;
45084   case 11:
45085     // mul x, 11 => add ((shl (mul x, 5), 1), x)
45086     return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
45087   case 21:
45088     // mul x, 21 => add ((shl (mul x, 5), 2), x)
45089     return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
45090   case 41:
45091     // mul x, 41 => add ((shl (mul x, 5), 3), x)
45092     return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
45093   case 22:
45094     // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
45095     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
45096                        combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
45097   case 19:
45098     // mul x, 19 => add ((shl (mul x, 9), 1), x)
45099     return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
45100   case 37:
45101     // mul x, 37 => add ((shl (mul x, 9), 2), x)
45102     return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
45103   case 73:
45104     // mul x, 73 => add ((shl (mul x, 9), 3), x)
45105     return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
45106   case 13:
45107     // mul x, 13 => add ((shl (mul x, 3), 2), x)
45108     return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
45109   case 23:
45110     // mul x, 23 => sub ((shl (mul x, 3), 3), x)
45111     return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
45112   case 26:
45113     // mul x, 26 => add ((mul (mul x, 5), 5), x)
45114     return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
45115   case 28:
45116     // mul x, 28 => add ((mul (mul x, 9), 3), x)
45117     return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
45118   case 29:
45119     // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
45120     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
45121                        combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
45122   }
45123 
  // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
  // by a single LEA.
  // First check if this is a sum of two powers of 2 because that's easy. Then
  // count the trailing zeros up to the first set bit.
45128   // TODO: We can do this even without LEA at a cost of two shifts and an add.
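  // e.g. mul x, 68 (= 64 + 4) becomes (shl x, 6) plus (shl x, 2), where the
  // second term can be folded into the LEA scale.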
45129   if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
45130     unsigned ScaleShift = countTrailingZeros(MulAmt);
45131     if (ScaleShift >= 1 && ScaleShift < 4) {
45132       unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
45133       SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45134                                    DAG.getConstant(ShiftAmt, DL, MVT::i8));
45135       SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45136                                    DAG.getConstant(ScaleShift, DL, MVT::i8));
45137       return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
45138     }
45139   }
45140 
45141   return SDValue();
45142 }
45143 
// If the upper 17 bits of one multiplication operand are zero and the upper
// 17 bits of the other operand are all zero or sign bits, then we can use
// PMADDWD, which is always at least as quick as PMULLD, except on KNL.
45147 static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
45148                                    const X86Subtarget &Subtarget) {
45149   if (!Subtarget.hasSSE2())
45150     return SDValue();
45151 
45152   if (Subtarget.isPMADDWDSlow())
45153     return SDValue();
45154 
45155   EVT VT = N->getValueType(0);
45156 
45157   // Only support vXi32 vectors.
45158   if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
45159     return SDValue();
45160 
45161   // Make sure the type is legal or can split/widen to a legal type.
45162   // With AVX512 but without BWI, we would need to split v32i16.
45163   unsigned NumElts = VT.getVectorNumElements();
45164   if (NumElts == 1 || !isPowerOf2_32(NumElts))
45165     return SDValue();
45166 
45167   EVT WVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, 2 * NumElts);
45168 
45169   // With AVX512 but without BWI, we would need to split v32i16.
45170   if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
45171     return SDValue();
45172 
45173   SDValue N0 = N->getOperand(0);
45174   SDValue N1 = N->getOperand(1);
45175 
  // If we are zero/sign extending two steps without SSE4.1, it's better to
45177   // reduce the vmul width instead.
45178   if (!Subtarget.hasSSE41() &&
45179       (((N0.getOpcode() == ISD::ZERO_EXTEND &&
45180          N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
45181         (N1.getOpcode() == ISD::ZERO_EXTEND &&
45182          N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
45183        ((N0.getOpcode() == ISD::SIGN_EXTEND &&
45184          N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
45185         (N1.getOpcode() == ISD::SIGN_EXTEND &&
45186          N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
45187     return SDValue();
45188 
  // If we are sign extending a wide vector without SSE4.1, it's better to reduce
45190   // the vmul width instead.
45191   if (!Subtarget.hasSSE41() &&
45192       (N0.getOpcode() == ISD::SIGN_EXTEND &&
45193        N0.getOperand(0).getValueSizeInBits() > 128) &&
45194       (N1.getOpcode() == ISD::SIGN_EXTEND &&
45195        N1.getOperand(0).getValueSizeInBits() > 128))
45196     return SDValue();
45197 
45198   // Sign bits must extend down to the lowest i16.
45199   if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
45200       DAG.ComputeMaxSignificantBits(N0) > 16)
45201     return SDValue();
45202 
45203   // At least one of the elements must be zero in the upper 17 bits, or can be
45204   // safely made zero without altering the final result.
45205   auto GetZeroableOp = [&](SDValue Op) {
45206     APInt Mask17 = APInt::getHighBitsSet(32, 17);
45207     if (DAG.MaskedValueIsZero(Op, Mask17))
45208       return Op;
45209     // Mask off upper 16-bits of sign-extended constants.
45210     if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
45211       return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
45212                          DAG.getConstant(0xFFFF, SDLoc(N), VT));
45213     if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
45214       SDValue Src = Op.getOperand(0);
45215       // Convert sext(vXi16) to zext(vXi16).
45216       if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
45217         return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
45218       // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
45219       // which will expand the extension.
45220       if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
45221         EVT ExtVT = VT.changeVectorElementType(MVT::i16);
45222         Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
45223         return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
45224       }
45225     }
    // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
45227     if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
45228         N->isOnlyUserOf(Op.getNode())) {
45229       SDValue Src = Op.getOperand(0);
45230       if (Src.getScalarValueSizeInBits() == 16)
45231         return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
45232     }
45233     // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
45234     if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
45235         N->isOnlyUserOf(Op.getNode())) {
45236       return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
45237                          Op.getOperand(1));
45238     }
45239     return SDValue();
45240   };
45241   SDValue ZeroN0 = GetZeroableOp(N0);
45242   SDValue ZeroN1 = GetZeroableOp(N1);
45243   if (!ZeroN0 && !ZeroN1)
45244     return SDValue();
45245   N0 = ZeroN0 ? ZeroN0 : N0;
45246   N1 = ZeroN1 ? ZeroN1 : N1;
45247 
45248   // Use SplitOpsAndApply to handle AVX splitting.
45249   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45250                            ArrayRef<SDValue> Ops) {
45251     MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
45252     return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
45253   };
45254   return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
45255                           { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
45256                           PMADDWDBuilder);
45257 }
45258 
45259 static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
45260                                   const X86Subtarget &Subtarget) {
45261   if (!Subtarget.hasSSE2())
45262     return SDValue();
45263 
45264   EVT VT = N->getValueType(0);
45265 
45266   // Only support vXi64 vectors.
45267   if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
45268       VT.getVectorNumElements() < 2 ||
45269       !isPowerOf2_32(VT.getVectorNumElements()))
45270     return SDValue();
45271 
45272   SDValue N0 = N->getOperand(0);
45273   SDValue N1 = N->getOperand(1);
45274 
  // PMULDQ returns the 64-bit result of the signed multiplication of the lower
45276   // 32-bits. We can lower with this if the sign bits stretch that far.
45277   if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
45278       DAG.ComputeNumSignBits(N1) > 32) {
45279     auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45280                             ArrayRef<SDValue> Ops) {
45281       return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
45282     };
45283     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
45284                             PMULDQBuilder, /*CheckBWI*/false);
45285   }
45286 
45287   // If the upper bits are zero we can use a single pmuludq.
45288   APInt Mask = APInt::getHighBitsSet(64, 32);
45289   if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
45290     auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45291                              ArrayRef<SDValue> Ops) {
45292       return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
45293     };
45294     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
45295                             PMULUDQBuilder, /*CheckBWI*/false);
45296   }
45297 
45298   return SDValue();
45299 }
45300 
45301 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
45302                           TargetLowering::DAGCombinerInfo &DCI,
45303                           const X86Subtarget &Subtarget) {
45304   EVT VT = N->getValueType(0);
45305 
45306   if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
45307     return V;
45308 
45309   if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
45310     return V;
45311 
45312   if (DCI.isBeforeLegalize() && VT.isVector())
45313     return reduceVMULWidth(N, DAG, Subtarget);
45314 
  // Optimize a single multiply by a constant into a sequence of two cheaper
  // instructions, e.g. LEA + SHL or LEA + LEA.
45317   if (!MulConstantOptimization)
45318     return SDValue();
45319 
45320   // An imul is usually smaller than the alternative sequence.
45321   if (DAG.getMachineFunction().getFunction().hasMinSize())
45322     return SDValue();
45323 
45324   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
45325     return SDValue();
45326 
45327   if (VT != MVT::i64 && VT != MVT::i32)
45328     return SDValue();
45329 
45330   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
45331   if (!C)
45332     return SDValue();
45333   if (isPowerOf2_64(C->getZExtValue()))
45334     return SDValue();
45335 
45336   int64_t SignMulAmt = C->getSExtValue();
45337   assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
45338   uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
45339 
45340   SDLoc DL(N);
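  // Multiplies by 3, 5 or 9 map onto a single LEA (via MUL_IMM), e.g.
  // mul x, 5 -> lea (x + 4*x). Negative amounts are negated afterwards.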
45341   if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
45342     SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
45343                                  DAG.getConstant(AbsMulAmt, DL, VT));
45344     if (SignMulAmt < 0)
45345       NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
45346                            NewMul);
45347 
45348     return NewMul;
45349   }
45350 
45351   uint64_t MulAmt1 = 0;
45352   uint64_t MulAmt2 = 0;
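  // Try to split AbsMulAmt into a product with an LEA-friendly factor (9, 5
  // or 3), e.g. 45 = 9 * 5 (two LEAs) or 40 = 5 * 8 (LEA + SHL).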
45353   if ((AbsMulAmt % 9) == 0) {
45354     MulAmt1 = 9;
45355     MulAmt2 = AbsMulAmt / 9;
45356   } else if ((AbsMulAmt % 5) == 0) {
45357     MulAmt1 = 5;
45358     MulAmt2 = AbsMulAmt / 5;
45359   } else if ((AbsMulAmt % 3) == 0) {
45360     MulAmt1 = 3;
45361     MulAmt2 = AbsMulAmt / 3;
45362   }
45363 
45364   SDValue NewMul;
45365   // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
45366   if (MulAmt2 &&
45367       (isPowerOf2_64(MulAmt2) ||
45368        (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
45369 
45370     if (isPowerOf2_64(MulAmt2) &&
45371         !(SignMulAmt >= 0 && N->hasOneUse() &&
45372           N->use_begin()->getOpcode() == ISD::ADD))
      // If the second multiplier is a power of 2, issue it first. We want the
      // multiply by 3, 5, or 9 to be folded into the addressing mode unless
      // the lone use is an add. Only do this for positive multiply amounts
      // since the negate would prevent it from being used as an address mode
      // anyway.
45377       std::swap(MulAmt1, MulAmt2);
45378 
45379     if (isPowerOf2_64(MulAmt1))
45380       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45381                            DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
45382     else
45383       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
45384                            DAG.getConstant(MulAmt1, DL, VT));
45385 
45386     if (isPowerOf2_64(MulAmt2))
45387       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
45388                            DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
45389     else
45390       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
45391                            DAG.getConstant(MulAmt2, DL, VT));
45392 
45393     // Negate the result.
45394     if (SignMulAmt < 0)
45395       NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
45396                            NewMul);
45397   } else if (!Subtarget.slowLEA())
45398     NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
45399 
45400   if (!NewMul) {
45401     assert(C->getZExtValue() != 0 &&
45402            C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
45403            "Both cases that could cause potential overflows should have "
45404            "already been handled.");
45405     if (isPowerOf2_64(AbsMulAmt - 1)) {
45406       // (mul x, 2^N + 1) => (add (shl x, N), x)
45407       NewMul = DAG.getNode(
45408           ISD::ADD, DL, VT, N->getOperand(0),
45409           DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45410                       DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
45411                                       MVT::i8)));
      // To negate, subtract the number from zero.
45413       if (SignMulAmt < 0)
45414         NewMul = DAG.getNode(ISD::SUB, DL, VT,
45415                              DAG.getConstant(0, DL, VT), NewMul);
45416     } else if (isPowerOf2_64(AbsMulAmt + 1)) {
45417       // (mul x, 2^N - 1) => (sub (shl x, N), x)
45418       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45419                            DAG.getConstant(Log2_64(AbsMulAmt + 1),
45420                                            DL, MVT::i8));
45421       // To negate, reverse the operands of the subtract.
45422       if (SignMulAmt < 0)
45423         NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
45424       else
45425         NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
45426     } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
45427       // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
45428       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45429                            DAG.getConstant(Log2_64(AbsMulAmt - 2),
45430                                            DL, MVT::i8));
45431       NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
45432       NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
45433     } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
45434       // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
45435       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
45436                            DAG.getConstant(Log2_64(AbsMulAmt + 2),
45437                                            DL, MVT::i8));
45438       NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
45439       NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
45440     }
45441   }
45442 
45443   return NewMul;
45444 }
45445 
45446 // Try to form a MULHU or MULHS node by looking for
45447 // (srl (mul ext, ext), 16)
45448 // TODO: This is X86 specific because we want to be able to handle wide types
45449 // before type legalization. But we can only do it if the vector will be
45450 // legalized via widening/splitting. Type legalization can't handle promotion
45451 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
45452 // combiner.
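// e.g. (srl (mul (zext vXi16 X), (zext vXi16 Y)), 16) --> (zext (mulhu X, Y)),
// and the equivalent sext/sra pattern maps to mulhs.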
45453 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
45454                                    const X86Subtarget &Subtarget) {
45455   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
45456            "SRL or SRA node is required here!");
45457   SDLoc DL(N);
45458 
45459   if (!Subtarget.hasSSE2())
45460     return SDValue();
45461 
45462   // The operation feeding into the shift must be a multiply.
45463   SDValue ShiftOperand = N->getOperand(0);
45464   if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
45465     return SDValue();
45466 
45467   // Input type should be at least vXi32.
45468   EVT VT = N->getValueType(0);
45469   if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
45470     return SDValue();
45471 
45472   // Need a shift by 16.
45473   APInt ShiftAmt;
45474   if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
45475       ShiftAmt != 16)
45476     return SDValue();
45477 
45478   SDValue LHS = ShiftOperand.getOperand(0);
45479   SDValue RHS = ShiftOperand.getOperand(1);
45480 
45481   unsigned ExtOpc = LHS.getOpcode();
45482   if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
45483       RHS.getOpcode() != ExtOpc)
45484     return SDValue();
45485 
45486   // Peek through the extends.
45487   LHS = LHS.getOperand(0);
45488   RHS = RHS.getOperand(0);
45489 
45490   // Ensure the input types match.
45491   EVT MulVT = LHS.getValueType();
45492   if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
45493     return SDValue();
45494 
45495   unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
45496   SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
45497 
45498   ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
45499   return DAG.getNode(ExtOpc, DL, VT, Mulh);
45500 }
45501 
45502 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
45503   SDValue N0 = N->getOperand(0);
45504   SDValue N1 = N->getOperand(1);
45505   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
45506   EVT VT = N0.getValueType();
45507 
  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // since the result of setcc_c is all zeros or all ones.
45510   if (VT.isInteger() && !VT.isVector() &&
45511       N1C && N0.getOpcode() == ISD::AND &&
45512       N0.getOperand(1).getOpcode() == ISD::Constant) {
45513     SDValue N00 = N0.getOperand(0);
45514     APInt Mask = N0.getConstantOperandAPInt(1);
45515     Mask <<= N1C->getAPIntValue();
45516     bool MaskOK = false;
    // We can handle cases concerning bit-widening nodes containing setcc_c if
    // we carefully interrogate the mask to make sure the transform is
    // semantics-preserving.
    // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
    // of the underlying setcc_c operation when the setcc_c was zero-extended.
45522     // Consider the following example:
45523     //   zext(setcc_c)                 -> i32 0x0000FFFF
45524     //   c1                            -> i32 0x0000FFFF
45525     //   c2                            -> i32 0x00000001
45526     //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
45527     //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
45528     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
45529       MaskOK = true;
45530     } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
45531                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
45532       MaskOK = true;
45533     } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
45534                 N00.getOpcode() == ISD::ANY_EXTEND) &&
45535                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
45536       MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
45537     }
45538     if (MaskOK && Mask != 0) {
45539       SDLoc DL(N);
45540       return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
45541     }
45542   }
45543 
  // Hardware support for vector shifts is sparse, which often forces us to
  // scalarize the vector operations. Also, on Sandy Bridge ADD is faster than
  // SHL.
45547   // (shl V, 1) -> add V,V
45548   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
45549     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
45550       assert(N0.getValueType().isVector() && "Invalid vector shift type");
45551       // We shift all of the values by one. In many cases we do not have
45552       // hardware support for this operation. This is better expressed as an ADD
45553       // of two values.
45554       if (N1SplatC->isOne())
45555         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
45556     }
45557 
45558   return SDValue();
45559 }
45560 
45561 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
45562                                            const X86Subtarget &Subtarget) {
45563   SDValue N0 = N->getOperand(0);
45564   SDValue N1 = N->getOperand(1);
45565   EVT VT = N0.getValueType();
45566   unsigned Size = VT.getSizeInBits();
45567 
45568   if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
45569     return V;
45570 
  // fold (sra (shl a, [56,48,32,24,16]), SarConst)
  // into (shl (sext_inreg a), [56,48,32,24,16] - SarConst) or
  // into (sra (sext_inreg a), SarConst - [56,48,32,24,16])
  // depending on the sign of (SarConst - [56,48,32,24,16]).

  // A sext_inreg on X86 is a MOVSX. It has the same code size as the shifts
  // above (only a shift by 1 has a smaller encoding), but has two advantages
  // over a shift:
  // 1. It can write to a register that differs from its source.
  // 2. It accepts memory operands.
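  // e.g. for i64: (sra (shl x, 56), 61) -> (sra (sext_inreg x, i8), 5) and
  //               (sra (shl x, 56), 53) -> (shl (sext_inreg x, i8), 3).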
45581 
45582   if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
45583       N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
45584       N0.getOperand(1).getOpcode() != ISD::Constant)
45585     return SDValue();
45586 
45587   SDValue N00 = N0.getOperand(0);
45588   SDValue N01 = N0.getOperand(1);
45589   APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
45590   APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
45591   EVT CVT = N1.getValueType();
45592 
45593   if (SarConst.isNegative())
45594     return SDValue();
45595 
45596   for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
45597     unsigned ShiftSize = SVT.getSizeInBits();
    // Skip types without a corresponding sext/zext and ShlConst values that
    // are not one of [56,48,32,24,16].
45600     if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
45601       continue;
45602     SDLoc DL(N);
45603     SDValue NN =
45604         DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
45605     SarConst = SarConst - (Size - ShiftSize);
45606     if (SarConst == 0)
45607       return NN;
45608     else if (SarConst.isNegative())
45609       return DAG.getNode(ISD::SHL, DL, VT, NN,
45610                          DAG.getConstant(-SarConst, DL, CVT));
45611     else
45612       return DAG.getNode(ISD::SRA, DL, VT, NN,
45613                          DAG.getConstant(SarConst, DL, CVT));
45614   }
45615   return SDValue();
45616 }
45617 
45618 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
45619                                         TargetLowering::DAGCombinerInfo &DCI,
45620                                         const X86Subtarget &Subtarget) {
45621   SDValue N0 = N->getOperand(0);
45622   SDValue N1 = N->getOperand(1);
45623   EVT VT = N0.getValueType();
45624 
45625   if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
45626     return V;
45627 
45628   // Only do this on the last DAG combine as it can interfere with other
45629   // combines.
45630   if (!DCI.isAfterLegalizeDAG())
45631     return SDValue();
45632 
45633   // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
45634   // TODO: This is a generic DAG combine that became an x86-only combine to
45635   // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
45636   // and-not ('andn').
45637   if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
45638     return SDValue();
45639 
45640   auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
45641   auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
45642   if (!ShiftC || !AndC)
45643     return SDValue();
45644 
45645   // If we can shrink the constant mask below 8-bits or 32-bits, then this
45646   // transform should reduce code size. It may also enable secondary transforms
45647   // from improved known-bits analysis or instruction selection.
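  // e.g. srl (and X, 0x7f00), 8 --> and (srl X, 8), 0x7f - the new mask fits
  // in a sign-extended imm8.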
45648   APInt MaskVal = AndC->getAPIntValue();
45649 
45650   // If this can be matched by a zero extend, don't optimize.
45651   if (MaskVal.isMask()) {
45652     unsigned TO = MaskVal.countTrailingOnes();
45653     if (TO >= 8 && isPowerOf2_32(TO))
45654       return SDValue();
45655   }
45656 
45657   APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
45658   unsigned OldMaskSize = MaskVal.getMinSignedBits();
45659   unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
45660   if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
45661       (OldMaskSize > 32 && NewMaskSize <= 32)) {
45662     // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
45663     SDLoc DL(N);
45664     SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
45665     SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
45666     return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
45667   }
45668   return SDValue();
45669 }
45670 
45671 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
45672                                          const X86Subtarget &Subtarget) {
45673   unsigned Opcode = N->getOpcode();
45674   assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
45675 
45676   SDLoc DL(N);
45677   EVT VT = N->getValueType(0);
45678   SDValue N0 = N->getOperand(0);
45679   SDValue N1 = N->getOperand(1);
45680   EVT SrcVT = N0.getValueType();
45681 
45682   SDValue BC0 =
45683       N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
45684   SDValue BC1 =
45685       N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
45686 
45687   // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
45688   // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
45689   // truncation trees that help us avoid lane crossing shuffles.
45690   // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
45691   // TODO: We don't handle vXf64 shuffles yet.
45692   if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
45693     if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
45694       SmallVector<SDValue> ShuffleOps;
45695       SmallVector<int> ShuffleMask, ScaledMask;
45696       SDValue Vec = peekThroughBitcasts(BCSrc);
45697       if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
45698         resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
        // To keep the HOP LHS/RHS coherency, we must be able to scale the
        // unary shuffle to a v4x64 width - we can probably relax this in the
        // future.
45701         if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
45702             ShuffleOps[0].getValueType().is256BitVector() &&
45703             scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
45704           SDValue Lo, Hi;
45705           MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
45706           std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
45707           Lo = DAG.getBitcast(SrcVT, Lo);
45708           Hi = DAG.getBitcast(SrcVT, Hi);
45709           SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
45710           Res = DAG.getBitcast(ShufVT, Res);
45711           Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
45712           return DAG.getBitcast(VT, Res);
45713         }
45714       }
45715     }
45716   }
45717 
45718   // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
45719   if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
45720     // If either/both ops are a shuffle that can scale to v2x64,
45721     // then see if we can perform this as a v4x32 post shuffle.
45722     SmallVector<SDValue> Ops0, Ops1;
45723     SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
45724     bool IsShuf0 =
45725         getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
45726         scaleShuffleElements(Mask0, 2, ScaledMask0) &&
45727         all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
45728     bool IsShuf1 =
45729         getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
45730         scaleShuffleElements(Mask1, 2, ScaledMask1) &&
45731         all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
45732     if (IsShuf0 || IsShuf1) {
45733       if (!IsShuf0) {
45734         Ops0.assign({BC0});
45735         ScaledMask0.assign({0, 1});
45736       }
45737       if (!IsShuf1) {
45738         Ops1.assign({BC1});
45739         ScaledMask1.assign({0, 1});
45740       }
45741 
45742       SDValue LHS, RHS;
45743       int PostShuffle[4] = {-1, -1, -1, -1};
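      // Map each 64-bit half of the result to its source vector: halves taken
      // from LHS get post-shuffle indices 0/1, halves from RHS get 2/3.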
45744       auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
45745         if (M < 0)
45746           return true;
45747         Idx = M % 2;
45748         SDValue Src = Ops[M / 2];
45749         if (!LHS || LHS == Src) {
45750           LHS = Src;
45751           return true;
45752         }
45753         if (!RHS || RHS == Src) {
45754           Idx += 2;
45755           RHS = Src;
45756           return true;
45757         }
45758         return false;
45759       };
45760       if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
45761           FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
45762           FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
45763           FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
45764         LHS = DAG.getBitcast(SrcVT, LHS);
45765         RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
45766         MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
45767         SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
45768         Res = DAG.getBitcast(ShufVT, Res);
45769         Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
45770         return DAG.getBitcast(VT, Res);
45771       }
45772     }
45773   }
45774 
45775   // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
45776   if (VT.is256BitVector() && Subtarget.hasInt256()) {
45777     SmallVector<int> Mask0, Mask1;
45778     SmallVector<SDValue> Ops0, Ops1;
45779     SmallVector<int, 2> ScaledMask0, ScaledMask1;
45780     if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
45781         getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
45782         !Ops0.empty() && !Ops1.empty() &&
45783         all_of(Ops0,
45784                [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
45785         all_of(Ops1,
45786                [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
45787         scaleShuffleElements(Mask0, 2, ScaledMask0) &&
45788         scaleShuffleElements(Mask1, 2, ScaledMask1)) {
45789       SDValue Op00 = peekThroughBitcasts(Ops0.front());
45790       SDValue Op10 = peekThroughBitcasts(Ops1.front());
45791       SDValue Op01 = peekThroughBitcasts(Ops0.back());
45792       SDValue Op11 = peekThroughBitcasts(Ops1.back());
45793       if ((Op00 == Op11) && (Op01 == Op10)) {
45794         std::swap(Op10, Op11);
45795         ShuffleVectorSDNode::commuteMask(ScaledMask1);
45796       }
45797       if ((Op00 == Op10) && (Op01 == Op11)) {
45798         const int Map[4] = {0, 2, 1, 3};
45799         SmallVector<int, 4> ShuffleMask(
45800             {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
45801              Map[ScaledMask1[1]]});
45802         MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
45803         SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
45804                                   DAG.getBitcast(SrcVT, Op01));
45805         Res = DAG.getBitcast(ShufVT, Res);
45806         Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
45807         return DAG.getBitcast(VT, Res);
45808       }
45809     }
45810   }
45811 
45812   return SDValue();
45813 }
45814 
45815 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
45816                                  TargetLowering::DAGCombinerInfo &DCI,
45817                                  const X86Subtarget &Subtarget) {
45818   unsigned Opcode = N->getOpcode();
45819   assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
45820          "Unexpected pack opcode");
45821 
45822   EVT VT = N->getValueType(0);
45823   SDValue N0 = N->getOperand(0);
45824   SDValue N1 = N->getOperand(1);
45825   unsigned NumDstElts = VT.getVectorNumElements();
45826   unsigned DstBitsPerElt = VT.getScalarSizeInBits();
45827   unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
45828   assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
45829          N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
45830          "Unexpected PACKSS/PACKUS input type");
45831 
45832   bool IsSigned = (X86ISD::PACKSS == Opcode);
45833 
45834   // Constant Folding.
45835   APInt UndefElts0, UndefElts1;
45836   SmallVector<APInt, 32> EltBits0, EltBits1;
45837   if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
45838       (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
45839       getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
45840       getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
45841     unsigned NumLanes = VT.getSizeInBits() / 128;
45842     unsigned NumSrcElts = NumDstElts / 2;
45843     unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
45844     unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
45845 
45846     APInt Undefs(NumDstElts, 0);
45847     SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
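    // PACKSS/PACKUS interleave per 128-bit lane: within each destination lane
    // the packed elements of the N0 lane are followed by those of the N1 lane.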
45848     for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
45849       for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
45850         unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
45851         auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
45852         auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
45853 
45854         if (UndefElts[SrcIdx]) {
45855           Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
45856           continue;
45857         }
45858 
45859         APInt &Val = EltBits[SrcIdx];
45860         if (IsSigned) {
45861           // PACKSS: Truncate signed value with signed saturation.
45862           // Source values less than dst minint are saturated to minint.
45863           // Source values greater than dst maxint are saturated to maxint.
45864           if (Val.isSignedIntN(DstBitsPerElt))
45865             Val = Val.trunc(DstBitsPerElt);
45866           else if (Val.isNegative())
45867             Val = APInt::getSignedMinValue(DstBitsPerElt);
45868           else
45869             Val = APInt::getSignedMaxValue(DstBitsPerElt);
45870         } else {
45871           // PACKUS: Truncate signed value with unsigned saturation.
45872           // Source values less than zero are saturated to zero.
45873           // Source values greater than dst maxuint are saturated to maxuint.
45874           if (Val.isIntN(DstBitsPerElt))
45875             Val = Val.trunc(DstBitsPerElt);
45876           else if (Val.isNegative())
45877             Val = APInt::getZero(DstBitsPerElt);
45878           else
45879             Val = APInt::getAllOnes(DstBitsPerElt);
45880         }
45881         Bits[Lane * NumDstEltsPerLane + Elt] = Val;
45882       }
45883     }
45884 
45885     return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
45886   }
45887 
45888   // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
45889   if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
45890     return V;
45891 
45892   // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
45893   // truncate to create a larger truncate.
45894   if (Subtarget.hasAVX512() &&
45895       N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
45896       N0.getOperand(0).getValueType() == MVT::v8i32) {
45897     if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
45898         (!IsSigned &&
45899          DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
45900       if (Subtarget.hasVLX())
45901         return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
45902 
45903       // Widen input to v16i32 so we can truncate that.
45904       SDLoc dl(N);
45905       SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
45906                                    N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
45907       return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
45908     }
45909   }
45910 
45911   // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
45912   if (VT.is128BitVector()) {
45913     unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
45914     SDValue Src0, Src1;
45915     if (N0.getOpcode() == ExtOpc &&
45916         N0.getOperand(0).getValueType().is64BitVector() &&
45917         N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
45918       Src0 = N0.getOperand(0);
45919     }
45920     if (N1.getOpcode() == ExtOpc &&
45921         N1.getOperand(0).getValueType().is64BitVector() &&
45922         N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
45923       Src1 = N1.getOperand(0);
45924     }
45925     if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
45926       assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
45927       Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
45928       Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
45929       return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
45930     }
45931 
45932     // Try again with pack(*_extend_vector_inreg, undef).
45933     unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
45934                                     : ISD::ZERO_EXTEND_VECTOR_INREG;
45935     if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
45936         N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
45937       return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
45938                                     DAG);
45939   }
45940 
45941   // Attempt to combine as shuffle.
45942   SDValue Op(N, 0);
45943   if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
45944     return Res;
45945 
45946   return SDValue();
45947 }
45948 
45949 static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
45950                                     TargetLowering::DAGCombinerInfo &DCI,
45951                                     const X86Subtarget &Subtarget) {
45952   assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
45953           X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
45954          "Unexpected horizontal add/sub opcode");
45955 
45956   if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
45957     MVT VT = N->getSimpleValueType(0);
45958     SDValue LHS = N->getOperand(0);
45959     SDValue RHS = N->getOperand(1);
45960 
    // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
45962     if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
45963         LHS.getOpcode() == RHS.getOpcode() &&
45964         LHS.getValueType() == RHS.getValueType() &&
45965         N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
45966       SDValue LHS0 = LHS.getOperand(0);
45967       SDValue LHS1 = LHS.getOperand(1);
45968       SDValue RHS0 = RHS.getOperand(0);
45969       SDValue RHS1 = RHS.getOperand(1);
45970       if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
45971           (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
45972         SDLoc DL(N);
45973         SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
45974                                   LHS0.isUndef() ? LHS1 : LHS0,
45975                                   RHS0.isUndef() ? RHS1 : RHS0);
45976         MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
45977         Res = DAG.getBitcast(ShufVT, Res);
45978         SDValue NewLHS =
45979             DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
45980                         getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
45981         SDValue NewRHS =
45982             DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
45983                         getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
45984         return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
45985                            DAG.getBitcast(VT, NewRHS));
45986       }
45987     }
45988   }
45989 
45990   // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
45991   if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
45992     return V;
45993 
45994   return SDValue();
45995 }
45996 
45997 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
45998                                      TargetLowering::DAGCombinerInfo &DCI,
45999                                      const X86Subtarget &Subtarget) {
46000   assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
46001           X86ISD::VSRL == N->getOpcode()) &&
46002          "Unexpected shift opcode");
46003   EVT VT = N->getValueType(0);
46004   SDValue N0 = N->getOperand(0);
46005   SDValue N1 = N->getOperand(1);
46006 
46007   // Shift zero -> zero.
46008   if (ISD::isBuildVectorAllZeros(N0.getNode()))
46009     return DAG.getConstant(0, SDLoc(N), VT);
46010 
46011   // Detect constant shift amounts.
46012   APInt UndefElts;
46013   SmallVector<APInt, 32> EltBits;
46014   if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
46015     unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
46016     return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
46017                                       EltBits[0].getZExtValue(), DAG);
46018   }
46019 
46020   APInt KnownUndef, KnownZero;
46021   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46022   APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
46023   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
46024                                      KnownZero, DCI))
46025     return SDValue(N, 0);
46026 
46027   return SDValue();
46028 }
46029 
46030 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
46031                                      TargetLowering::DAGCombinerInfo &DCI,
46032                                      const X86Subtarget &Subtarget) {
46033   unsigned Opcode = N->getOpcode();
46034   assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
46035           X86ISD::VSRLI == Opcode) &&
46036          "Unexpected shift opcode");
46037   bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
46038   EVT VT = N->getValueType(0);
46039   SDValue N0 = N->getOperand(0);
46040   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
46041   assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
46042          "Unexpected value type");
46043   assert(N->getOperand(1).getValueType() == MVT::i8 &&
46044          "Unexpected shift amount type");
46045 
46046   // (shift undef, X) -> 0
46047   if (N0.isUndef())
46048     return DAG.getConstant(0, SDLoc(N), VT);
46049 
46050   // Out of range logical bit shifts are guaranteed to be zero.
46051   // Out of range arithmetic bit shifts splat the sign bit.
46052   unsigned ShiftVal = N->getConstantOperandVal(1);
46053   if (ShiftVal >= NumBitsPerElt) {
46054     if (LogicalShift)
46055       return DAG.getConstant(0, SDLoc(N), VT);
46056     ShiftVal = NumBitsPerElt - 1;
46057   }
46058 
46059   // (shift X, 0) -> X
46060   if (!ShiftVal)
46061     return N0;
46062 
46063   // (shift 0, C) -> 0
46064   if (ISD::isBuildVectorAllZeros(N0.getNode()))
46065     // N0 is all zeros or undef. We guarantee that the bits shifted into the
46066     // result are all zeros, not undef.
46067     return DAG.getConstant(0, SDLoc(N), VT);
46068 
46069   // (VSRAI -1, C) -> -1
46070   if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
46071     // N0 is all ones or undef. We guarantee that the bits shifted into the
46072     // result are all ones, not undef.
46073     return DAG.getConstant(-1, SDLoc(N), VT);
46074 
46075   // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
46076   if (Opcode == N0.getOpcode()) {
46077     unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
46078     unsigned NewShiftVal = ShiftVal + ShiftVal2;
46079     if (NewShiftVal >= NumBitsPerElt) {
46080       // Out of range logical bit shifts are guaranteed to be zero.
46081       // Out of range arithmetic bit shifts splat the sign bit.
46082       if (LogicalShift)
46083         return DAG.getConstant(0, SDLoc(N), VT);
46084       NewShiftVal = NumBitsPerElt - 1;
46085     }
46086     return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
46087                        DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
46088   }
46089 
46090   // We can decode 'whole byte' logical bit shifts as shuffles.
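  // e.g. a v2i64 VSRLI by 8 bits just moves whole bytes within each element
  // (zero-filling the top byte), so it can fold with surrounding byte
  // shuffles.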
46091   if (LogicalShift && (ShiftVal % 8) == 0) {
46092     SDValue Op(N, 0);
46093     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
46094       return Res;
46095   }
46096 
46097   // Constant Folding.
46098   APInt UndefElts;
46099   SmallVector<APInt, 32> EltBits;
46100   if (N->isOnlyUserOf(N0.getNode()) &&
46101       getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
46102     assert(EltBits.size() == VT.getVectorNumElements() &&
46103            "Unexpected shift value type");
    // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
    // created an undef input because no input bits were demanded, but the
    // user still expects 0 in the other bits.
46107     for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
46108       APInt &Elt = EltBits[i];
46109       if (UndefElts[i])
46110         Elt = 0;
46111       else if (X86ISD::VSHLI == Opcode)
46112         Elt <<= ShiftVal;
46113       else if (X86ISD::VSRAI == Opcode)
46114         Elt.ashrInPlace(ShiftVal);
46115       else
46116         Elt.lshrInPlace(ShiftVal);
46117     }
46118     // Reset undef elements since they were zeroed above.
46119     UndefElts = 0;
46120     return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
46121   }
46122 
46123   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46124   if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
46125                                DCI))
46126     return SDValue(N, 0);
46127 
46128   return SDValue();
46129 }
46130 
46131 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
46132                                    TargetLowering::DAGCombinerInfo &DCI,
46133                                    const X86Subtarget &Subtarget) {
46134   EVT VT = N->getValueType(0);
46135   assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
46136           (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
46137           N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
46138          "Unexpected vector insertion");
46139 
46140   if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
46141     unsigned NumBitsPerElt = VT.getScalarSizeInBits();
46142     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46143     if (TLI.SimplifyDemandedBits(SDValue(N, 0),
46144                                  APInt::getAllOnes(NumBitsPerElt), DCI))
46145       return SDValue(N, 0);
46146   }
46147 
46148   // Attempt to combine insertion patterns to a shuffle.
46149   if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
46150     SDValue Op(N, 0);
46151     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
46152       return Res;
46153   }
46154 
46155   return SDValue();
46156 }
46157 
46158 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
46159 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
46160 /// OR -> CMPNEQSS.
46161 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
46162                                    TargetLowering::DAGCombinerInfo &DCI,
46163                                    const X86Subtarget &Subtarget) {
46164   unsigned opcode;
46165 
46166   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
46167   // we're requiring SSE2 for both.
46168   if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
46169     SDValue N0 = N->getOperand(0);
46170     SDValue N1 = N->getOperand(1);
46171     SDValue CMP0 = N0.getOperand(1);
46172     SDValue CMP1 = N1.getOperand(1);
46173     SDLoc DL(N);
46174 
46175     // The SETCCs should both refer to the same CMP.
46176     if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
46177       return SDValue();
46178 
46179     SDValue CMP00 = CMP0->getOperand(0);
46180     SDValue CMP01 = CMP0->getOperand(1);
46181     EVT     VT    = CMP00.getValueType();
46182 
46183     if (VT == MVT::f32 || VT == MVT::f64 ||
46184         (VT == MVT::f16 && Subtarget.hasFP16())) {
46185       bool ExpectingFlags = false;
46186       // Check for any users that want flags:
46187       for (const SDNode *U : N->uses()) {
46188         if (ExpectingFlags)
46189           break;
46190 
46191         switch (U->getOpcode()) {
46192         default:
46193         case ISD::BR_CC:
46194         case ISD::BRCOND:
46195         case ISD::SELECT:
46196           ExpectingFlags = true;
46197           break;
46198         case ISD::CopyToReg:
46199         case ISD::SIGN_EXTEND:
46200         case ISD::ZERO_EXTEND:
46201         case ISD::ANY_EXTEND:
46202           break;
46203         }
46204       }
46205 
46206       if (!ExpectingFlags) {
46207         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
46208         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
46209 
46210         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
46211           X86::CondCode tmp = cc0;
46212           cc0 = cc1;
46213           cc1 = tmp;
46214         }
46215 
46216         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
46217             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
46218           // FIXME: need symbolic constants for these magic numbers.
46219           // See X86ATTInstPrinter.cpp:printSSECC().
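          // In the SSE comparison predicate encoding, 0 selects CMPEQ and 4
          // selects CMPNEQ.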
46220           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
46221           if (Subtarget.hasAVX512()) {
46222             SDValue FSetCC =
46223                 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
46224                             DAG.getTargetConstant(x86cc, DL, MVT::i8));
46225             // Need to fill with zeros to ensure the bitcast will produce zeroes
46226             // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
46227             SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
46228                                       DAG.getConstant(0, DL, MVT::v16i1),
46229                                       FSetCC, DAG.getIntPtrConstant(0, DL));
46230             return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
46231                                       N->getSimpleValueType(0));
46232           }
46233           SDValue OnesOrZeroesF =
46234               DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
46235                           CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
46236 
46237           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
46238           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
46239 
46240           if (is64BitFP && !Subtarget.is64Bit()) {
46241             // On a 32-bit target, we cannot bitcast the 64-bit float to a
46242             // 64-bit integer, since that's not a legal type. Since
46243             // OnesOrZeroesF is all ones or all zeroes, we don't need all the
46244             // bits, but can do this little dance to extract the lowest 32 bits
46245             // and work with those going forward.
46246             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
46247                                            OnesOrZeroesF);
46248             SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
46249             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
46250                                         Vector32, DAG.getIntPtrConstant(0, DL));
46251             IntVT = MVT::i32;
46252           }
46253 
46254           SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
46255           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
46256                                       DAG.getConstant(1, DL, IntVT));
46257           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
46258                                               ANDed);
46259           return OneBitOfTruth;
46260         }
46261       }
46262     }
46263   }
46264   return SDValue();
46265 }
46266 
46267 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
46268 static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
46269   assert(N->getOpcode() == ISD::AND);
46270 
46271   MVT VT = N->getSimpleValueType(0);
46272   if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
46273     return SDValue();
46274 
46275   SDValue X, Y;
46276   SDValue N0 = N->getOperand(0);
46277   SDValue N1 = N->getOperand(1);
46278 
46279   auto GetNot = [&VT, &DAG](SDValue V) {
46280     // Basic X = NOT(Y) detection.
46281     if (SDValue Not = IsNOT(V, DAG))
46282       return Not;
46283     // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
46284     if (V.getOpcode() == X86ISD::VBROADCAST) {
46285       SDValue Src = V.getOperand(0);
46286       EVT SrcVT = Src.getValueType();
46287       if (!SrcVT.isVector())
46288         return SDValue();
46289       if (SDValue Not = IsNOT(Src, DAG))
46290         return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
46291                            DAG.getBitcast(SrcVT, Not));
46292     }
46293     return SDValue();
46294   };
46295 
46296   if (SDValue Not = GetNot(N0)) {
46297     X = Not;
46298     Y = N1;
46299   } else if (SDValue Not = GetNot(N1)) {
46300     X = Not;
46301     Y = N0;
46302   } else
46303     return SDValue();
46304 
46305   X = DAG.getBitcast(VT, X);
46306   Y = DAG.getBitcast(VT, Y);
46307   return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
46308 }
46309 
// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
// logical operations, like in the example below.
//   or (and (truncate x), (truncate y)),
//      (xor (truncate z), (build_vector (constants)))
// Given a target type \p VT, we generate
//   or (and x, y), (xor z, zext(build_vector (constants)))
// given that x, y and z are of type \p VT. We can do so if each operand is
// either a truncate from VT, a constant build_vector, or a value that can be
// recursively promoted.
46319 static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
46320                                      unsigned Depth) {
46321   // Limit recursion to avoid excessive compile times.
46322   if (Depth >= SelectionDAG::MaxRecursionDepth)
46323     return SDValue();
46324 
46325   if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
46326       N->getOpcode() != ISD::OR)
46327     return SDValue();
46328 
46329   SDValue N0 = N->getOperand(0);
46330   SDValue N1 = N->getOperand(1);
46331   SDLoc DL(N);
46332 
46333   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46334   if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
46335     return SDValue();
46336 
46337   if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
46338     N0 = NN0;
46339   else {
    // The left side has to be a 'trunc'.
46341     if (N0.getOpcode() != ISD::TRUNCATE)
46342       return SDValue();
46343 
46344     // The type of the truncated inputs.
46345     if (N0.getOperand(0).getValueType() != VT)
46346       return SDValue();
46347 
46348     N0 = N0.getOperand(0);
46349   }
46350 
46351   if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
46352     N1 = NN1;
46353   else {
46354     // The right side has to be a 'trunc' or a constant vector.
46355     bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
46356                     N1.getOperand(0).getValueType() == VT;
46357     if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
46358       return SDValue();
46359 
46360     if (RHSTrunc)
46361       N1 = N1.getOperand(0);
46362     else
46363       N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
46364   }
46365 
46366   return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
46367 }
46368 
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
// some of the transition sequences.
46373 // Even with AVX-512 this is still useful for removing casts around logical
46374 // operations on vXi1 mask types.
46375 static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
46376                                      const X86Subtarget &Subtarget) {
46377   EVT VT = N->getValueType(0);
46378   assert(VT.isVector() && "Expected vector type");
46379 
46380   SDLoc DL(N);
46381   assert((N->getOpcode() == ISD::ANY_EXTEND ||
46382           N->getOpcode() == ISD::ZERO_EXTEND ||
46383           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
46384 
46385   SDValue Narrow = N->getOperand(0);
46386   EVT NarrowVT = Narrow.getValueType();
46387 
46388   // Generate the wide operation.
46389   SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
46390   if (!Op)
46391     return SDValue();
46392   switch (N->getOpcode()) {
46393   default: llvm_unreachable("Unexpected opcode");
46394   case ISD::ANY_EXTEND:
46395     return Op;
46396   case ISD::ZERO_EXTEND:
46397     return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
46398   case ISD::SIGN_EXTEND:
46399     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
46400                        Op, DAG.getValueType(NarrowVT));
46401   }
46402 }
46403 
46404 static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
46405   unsigned FPOpcode;
46406   switch (Opcode) {
46407   default: llvm_unreachable("Unexpected input node for FP logic conversion");
46408   case ISD::AND: FPOpcode = X86ISD::FAND; break;
46409   case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
46410   case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46411   }
46412   return FPOpcode;
46413 }
46414 
46415 /// If both input operands of a logic op are being cast from floating-point
46416 /// types or FP compares, try to convert this into a floating-point logic node
46417 /// to avoid unnecessary moves from SSE to integer registers.
46418 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
46419                                         TargetLowering::DAGCombinerInfo &DCI,
46420                                         const X86Subtarget &Subtarget) {
46421   EVT VT = N->getValueType(0);
46422   SDValue N0 = N->getOperand(0);
46423   SDValue N1 = N->getOperand(1);
46424   SDLoc DL(N);
46425 
46426   if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
46427         (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
46428     return SDValue();
46429 
46430   SDValue N00 = N0.getOperand(0);
46431   SDValue N10 = N1.getOperand(0);
46432   EVT N00Type = N00.getValueType();
46433   EVT N10Type = N10.getValueType();
46434 
46435   // Ensure that both types are the same and are legal scalar fp types.
46436   if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
46437                               (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
46438                               (Subtarget.hasFP16() && N00Type == MVT::f16)))
46439     return SDValue();
46440 
46441   if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
46442     unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
46443     SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
46444     return DAG.getBitcast(VT, FPLogic);
46445   }
46446 
46447   // The vector ISA for FP predicates is incomplete before AVX, so converting
46448   // COMIS* to CMPS* may not be a win before AVX.
46449   // TODO: Check types/predicates to see if they are available with SSE/SSE2.
46450   if (!Subtarget.hasAVX() || VT != MVT::i1 || N0.getOpcode() != ISD::SETCC ||
46451       !N0.hasOneUse() || !N1.hasOneUse())
46452     return SDValue();
46453 
46454   // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
46455   // and vector logic:
46456   // logic (setcc N00, N01), (setcc N10, N11) -->
46457   // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
46458   unsigned NumElts = 128 / N00Type.getSizeInBits();
46459   EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
46460   EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
46461   SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
46462   SDValue N01 = N0.getOperand(1);
46463   SDValue N11 = N1.getOperand(1);
46464   SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
46465   SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
46466   SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
46467   SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
46468   SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01,
46469                                 cast<CondCodeSDNode>(N0.getOperand(2))->get());
46470   SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11,
46471                                 cast<CondCodeSDNode>(N1.getOperand(2))->get());
46472   SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
46473   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
46474 }
46475 
46476 // Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
46477 // to reduce XMM->GPR traffic.
46478 static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
46479   unsigned Opc = N->getOpcode();
46480   assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
46481          "Unexpected bit opcode");
46482 
46483   SDValue N0 = N->getOperand(0);
46484   SDValue N1 = N->getOperand(1);
46485 
46486   // Both operands must be single use MOVMSK.
46487   if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
46488       N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
46489     return SDValue();
46490 
46491   SDValue Vec0 = N0.getOperand(0);
46492   SDValue Vec1 = N1.getOperand(0);
46493   EVT VecVT0 = Vec0.getValueType();
46494   EVT VecVT1 = Vec1.getValueType();
46495 
  // Both MOVMSK operands must be from vectors of the same size and same
  // element size, but it's OK if they differ in fp/int type.
46498   if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
46499       VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
46500     return SDValue();
46501 
46502   SDLoc DL(N);
46503   unsigned VecOpc =
46504       VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
46505   SDValue Result =
46506       DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
46507   return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46508 }
46509 
46510 // Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
46511 // NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
46512 // handles in InstCombine.
46513 static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
46514   unsigned Opc = N->getOpcode();
46515   assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
46516          "Unexpected bit opcode");
46517 
46518   SDValue N0 = N->getOperand(0);
46519   SDValue N1 = N->getOperand(1);
46520   EVT VT = N->getValueType(0);
46521 
46522   // Both operands must be single use.
46523   if (!N0.hasOneUse() || !N1.hasOneUse())
46524     return SDValue();
46525 
46526   // Search for matching shifts.
46527   SDValue BC0 = peekThroughOneUseBitcasts(N0);
46528   SDValue BC1 = peekThroughOneUseBitcasts(N1);
46529 
46530   unsigned BCOpc = BC0.getOpcode();
46531   EVT BCVT = BC0.getValueType();
46532   if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
46533     return SDValue();
46534 
46535   switch (BCOpc) {
46536   case X86ISD::VSHLI:
46537   case X86ISD::VSRLI:
46538   case X86ISD::VSRAI: {
46539     if (BC0.getOperand(1) != BC1.getOperand(1))
46540       return SDValue();
46541 
46542     SDLoc DL(N);
46543     SDValue BitOp =
46544         DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
46545     SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
46546     return DAG.getBitcast(VT, Shift);
46547   }
46548   }
46549 
46550   return SDValue();
46551 }
46552 
/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
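/// For example (illustrative), if every element of X is known to be all-zeros
/// or all-ones:
///   (and v4i32 X, splat(1)) --> (vsrli v4i32 X, 31)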
46556 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
46557                                      const X86Subtarget &Subtarget) {
46558   SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
46559   SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
46560   EVT VT = Op0.getValueType();
46561   if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
46562     return SDValue();
46563 
46564   // Try to convert an "is positive" signbit masking operation into arithmetic
46565   // shift and "andn". This saves a materialization of a -1 vector constant.
46566   // The "is negative" variant should be handled more generally because it only
46567   // requires "and" rather than "andn":
46568   // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
46569   //
46570   // This is limited to the original type to avoid producing even more bitcasts.
46571   // If the bitcasts can't be eliminated, then it is unlikely that this fold
46572   // will be profitable.
46573   if (N->getValueType(0) == VT &&
46574       supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRA)) {
46575     SDValue X, Y;
46576     if (Op1.hasOneUse() && Op1.getOpcode() == X86ISD::PCMPGT &&
46577         isAllOnesOrAllOnesSplat(Op1.getOperand(1))) {
46578       X = Op1.getOperand(0);
46579       Y = Op0;
46580     } else if (Op0.hasOneUse() && Op0.getOpcode() == X86ISD::PCMPGT &&
46581                isAllOnesOrAllOnesSplat(Op0.getOperand(1))) {
46582       X = Op0.getOperand(0);
46583       Y = Op1;
46584     }
46585     if (X && Y) {
46586       SDLoc DL(N);
46587       SDValue Sra =
46588           getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
46589                                      VT.getScalarSizeInBits() - 1, DAG);
46590       return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
46591     }
46592   }
46593 
46594   APInt SplatVal;
46595   if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
46596       !SplatVal.isMask())
46597     return SDValue();
46598 
46599   // Don't prevent creation of ANDN.
46600   if (isBitwiseNot(Op0))
46601     return SDValue();
46602 
46603   if (!supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRL))
46604     return SDValue();
46605 
46606   unsigned EltBitWidth = VT.getScalarSizeInBits();
46607   if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
46608     return SDValue();
46609 
46610   SDLoc DL(N);
46611   unsigned ShiftVal = SplatVal.countTrailingOnes();
46612   SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
46613   SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
46614   return DAG.getBitcast(N->getValueType(0), Shift);
46615 }
46616 
46617 // Get the index node from the lowered DAG of a GEP IR instruction with one
46618 // indexing dimension.
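// For example (illustrative), for a base pointer of the form
//   (add (shl Index, Scale), BasePtr)
// this returns Index.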
46619 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
46620   if (Ld->isIndexed())
46621     return SDValue();
46622 
46623   SDValue Base = Ld->getBasePtr();
46624 
46625   if (Base.getOpcode() != ISD::ADD)
46626     return SDValue();
46627 
46628   SDValue ShiftedIndex = Base.getOperand(0);
46629 
46630   if (ShiftedIndex.getOpcode() != ISD::SHL)
46631     return SDValue();
46632 
  return ShiftedIndex.getOperand(0);
}
46636 
46637 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
46638   if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
46639     switch (VT.getSizeInBits()) {
46640     default: return false;
    case 64: return Subtarget.is64Bit();
46642     case 32: return true;
46643     }
46644   }
46645   return false;
46646 }
46647 
// This function recognizes cases where the X86 BZHI instruction can replace an
// 'and-load' sequence.
// When loading an integer value from an array of constants defined as follows:
//
//   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
//
// and then applying a bitwise AND of the result with another input, the
// sequence is equivalent to performing BZHI (zero high bits) on that input,
// using the same index as the load.
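// For example (illustrative), for a 32-bit element type:
//   (and (load arr[idx]), inp) == (and inp, (1 << idx) - 1)
// which is exactly what BZHI inp, idx computes.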
46658 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
46659                                     const X86Subtarget &Subtarget) {
46660   MVT VT = Node->getSimpleValueType(0);
46661   SDLoc dl(Node);
46662 
46663   // Check if subtarget has BZHI instruction for the node's type
46664   if (!hasBZHI(Subtarget, VT))
46665     return SDValue();
46666 
46667   // Try matching the pattern for both operands.
46668   for (unsigned i = 0; i < 2; i++) {
46669     SDValue N = Node->getOperand(i);
46670     LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
46671 
    // Bail out if the operand is not a load instruction.
46673     if (!Ld)
46674       return SDValue();
46675 
46676     const Value *MemOp = Ld->getMemOperand()->getValue();
46677 
46678     if (!MemOp)
46679       return SDValue();
46680 
46681     if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
46682       if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
46683         if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
46684 
46685           Constant *Init = GV->getInitializer();
46686           Type *Ty = Init->getType();
46687           if (!isa<ConstantDataArray>(Init) ||
46688               !Ty->getArrayElementType()->isIntegerTy() ||
46689               Ty->getArrayElementType()->getScalarSizeInBits() !=
46690                   VT.getSizeInBits() ||
46691               Ty->getArrayNumElements() >
46692                   Ty->getArrayElementType()->getScalarSizeInBits())
46693             continue;
46694 
46695           // Check if the array's constant elements are suitable to our case.
46696           uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
46697           bool ConstantsMatch = true;
46698           for (uint64_t j = 0; j < ArrayElementCount; j++) {
46699             auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
46700             if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
46701               ConstantsMatch = false;
46702               break;
46703             }
46704           }
46705           if (!ConstantsMatch)
46706             continue;
46707 
          // Do the transformation (for a 32-bit type):
          // -> (and (load arr[idx]), inp)
          // <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
          //    which will be selected as a single BZHI instruction.
46712           SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
46713           SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
46714 
46715           // Get the Node which indexes into the array.
46716           SDValue Index = getIndexFromUnindexedLoad(Ld);
46717           if (!Index)
46718             return SDValue();
46719           Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
46720 
46721           SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
46722           Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
46723 
46724           SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
46725           SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
46726 
46727           return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
46728         }
46729       }
46730     }
46731   }
46732   return SDValue();
46733 }
46734 
// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef, ...))), C)
// where C is a mask containing the same number of bits as the setcc and
// where the setcc will freely zero the upper bits of the k-register. We can
// replace the undef in the concat with 0s and remove the AND. This mainly
// helps with v2i1/v4i1 setcc being cast to scalar.
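// For example (illustrative):
//   (and (i8 bitcast (v8i1 concat (v2i1 setcc), undef, undef, undef)), 3)
//     --> (i8 bitcast (v8i1 concat (v2i1 setcc), zero, zero, zero))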
46740 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
46741                                              const X86Subtarget &Subtarget) {
46742   assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
46743 
46744   EVT VT = N->getValueType(0);
46745 
46746   // Make sure this is an AND with constant. We will check the value of the
46747   // constant later.
46748   if (!isa<ConstantSDNode>(N->getOperand(1)))
46749     return SDValue();
46750 
46751   // This is implied by the ConstantSDNode.
46752   assert(!VT.isVector() && "Expected scalar VT!");
46753 
46754   if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
46755       !N->getOperand(0).hasOneUse() ||
46756       !N->getOperand(0).getOperand(0).hasOneUse())
46757     return SDValue();
46758 
46759   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46760   SDValue Src = N->getOperand(0).getOperand(0);
46761   EVT SrcVT = Src.getValueType();
46762   if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
46763       !TLI.isTypeLegal(SrcVT))
46764     return SDValue();
46765 
46766   if (Src.getOpcode() != ISD::CONCAT_VECTORS)
46767     return SDValue();
46768 
  // We only care about the first subvector of the concat; we expect the
  // other subvectors to be ignored due to the AND if we make the change.
46771   SDValue SubVec = Src.getOperand(0);
46772   EVT SubVecVT = SubVec.getValueType();
46773 
46774   // First subvector should be a setcc with a legal result type. The RHS of the
46775   // AND should be a mask with this many bits.
46776   if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
46777       !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
46778     return SDValue();
46779 
46780   EVT SetccVT = SubVec.getOperand(0).getValueType();
46781   if (!TLI.isTypeLegal(SetccVT) ||
46782       !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
46783     return SDValue();
46784 
46785   if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
46786     return SDValue();
46787 
46788   // We passed all the checks. Rebuild the concat_vectors with zeroes
46789   // and cast it back to VT.
46790   SDLoc dl(N);
46791   SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
46792                               DAG.getConstant(0, dl, SubVecVT));
46793   Ops[0] = SubVec;
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, Ops);
46796   return DAG.getBitcast(VT, Concat);
46797 }
46798 
46799 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
46800                           TargetLowering::DAGCombinerInfo &DCI,
46801                           const X86Subtarget &Subtarget) {
46802   SDValue N0 = N->getOperand(0);
46803   SDValue N1 = N->getOperand(1);
46804   EVT VT = N->getValueType(0);
46805   SDLoc dl(N);
46806   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46807 
46808   // If this is SSE1 only convert to FAND to avoid scalarization.
46809   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
46810     return DAG.getBitcast(MVT::v4i32,
46811                           DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
46812                                       DAG.getBitcast(MVT::v4f32, N0),
46813                                       DAG.getBitcast(MVT::v4f32, N1)));
46814   }
46815 
46816   // Use a 32-bit and+zext if upper bits known zero.
46817   if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
46818     APInt HiMask = APInt::getHighBitsSet(64, 32);
46819     if (DAG.MaskedValueIsZero(N1, HiMask) ||
46820         DAG.MaskedValueIsZero(N0, HiMask)) {
46821       SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
46822       SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
46823       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
46824                          DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
46825     }
46826   }
46827 
46828   // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
46829   // TODO: Support multiple SrcOps.
46830   if (VT == MVT::i1) {
46831     SmallVector<SDValue, 2> SrcOps;
46832     SmallVector<APInt, 2> SrcPartials;
46833     if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
46834         SrcOps.size() == 1) {
46835       unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
46836       EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
46837       SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
46838       if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
46839         Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
46840       if (Mask) {
46841         assert(SrcPartials[0].getBitWidth() == NumElts &&
46842                "Unexpected partial reduction mask");
46843         SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
46844         Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
46845         return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
46846       }
46847     }
46848   }
46849 
46850   if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
46851     return V;
46852 
46853   if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
46854     return R;
46855 
46856   if (SDValue R = combineBitOpWithShift(N, DAG))
46857     return R;
46858 
46859   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
46860     return FPLogic;
46861 
46862   if (DCI.isBeforeLegalizeOps())
46863     return SDValue();
46864 
46865   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
46866     return R;
46867 
46868   if (SDValue R = combineAndNotIntoANDNP(N, DAG))
46869     return R;
46870 
46871   if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
46872     return ShiftRight;
46873 
46874   if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
46875     return R;
46876 
46877   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
46878     // Attempt to recursively combine a bitmask AND with shuffles.
46879     SDValue Op(N, 0);
46880     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
46881       return Res;
46882 
46883     // If either operand is a constant mask, then only the elements that aren't
46884     // zero are actually demanded by the other operand.
46885     auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
46886       APInt UndefElts;
46887       SmallVector<APInt> EltBits;
46888       int NumElts = VT.getVectorNumElements();
46889       int EltSizeInBits = VT.getScalarSizeInBits();
46890       if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
46891         return false;
46892 
46893       APInt DemandedBits = APInt::getZero(EltSizeInBits);
46894       APInt DemandedElts = APInt::getZero(NumElts);
46895       for (int I = 0; I != NumElts; ++I)
46896         if (!EltBits[I].isZero()) {
46897           DemandedBits |= EltBits[I];
46898           DemandedElts.setBit(I);
46899         }
46900 
46901       APInt KnownUndef, KnownZero;
46902       return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef,
46903                                             KnownZero, DCI) ||
46904              TLI.SimplifyDemandedBits(OtherOp, DemandedBits, DemandedElts, DCI);
46905     };
46906     if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
46907       if (N->getOpcode() != ISD::DELETED_NODE)
46908         DCI.AddToWorklist(N);
46909       return SDValue(N, 0);
46910     }
46911   }
46912 
46913   // Attempt to combine a scalar bitmask AND with an extracted shuffle.
46914   if ((VT.getScalarSizeInBits() % 8) == 0 &&
46915       N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46916       isa<ConstantSDNode>(N0.getOperand(1))) {
46917     SDValue BitMask = N1;
46918     SDValue SrcVec = N0.getOperand(0);
46919     EVT SrcVecVT = SrcVec.getValueType();
46920 
46921     // Check that the constant bitmask masks whole bytes.
46922     APInt UndefElts;
46923     SmallVector<APInt, 64> EltBits;
46924     if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
46925         getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
46926         llvm::all_of(EltBits, [](const APInt &M) {
46927           return M.isZero() || M.isAllOnes();
46928         })) {
46929       unsigned NumElts = SrcVecVT.getVectorNumElements();
46930       unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
46931       unsigned Idx = N0.getConstantOperandVal(1);
46932 
46933       // Create a root shuffle mask from the byte mask and the extracted index.
46934       SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
46935       for (unsigned i = 0; i != Scale; ++i) {
46936         if (UndefElts[i])
46937           continue;
46938         int VecIdx = Scale * Idx + i;
46939         ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
46940       }
46941 
46942       if (SDValue Shuffle = combineX86ShufflesRecursively(
46943               {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
46944               X86::MaxShuffleCombineDepth,
46945               /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
46946               /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
46947         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
46948                            N0.getOperand(1));
46949     }
46950   }
46951 
46952   return SDValue();
46953 }
46954 
46955 // Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
46956 static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
46957                                      const X86Subtarget &Subtarget) {
46958   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
46959 
46960   MVT VT = N->getSimpleValueType(0);
46961   unsigned EltSizeInBits = VT.getScalarSizeInBits();
46962   if (!VT.isVector() || (EltSizeInBits % 8) != 0)
46963     return SDValue();
46964 
46965   SDValue N0 = peekThroughBitcasts(N->getOperand(0));
46966   SDValue N1 = peekThroughBitcasts(N->getOperand(1));
46967   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
46968     return SDValue();
46969 
46970   // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
46971   // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
46972   if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
46973         !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
46974     return SDValue();
46975 
46976   // Attempt to extract constant byte masks.
46977   APInt UndefElts0, UndefElts1;
46978   SmallVector<APInt, 32> EltBits0, EltBits1;
46979   if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
46980                                      false, false))
46981     return SDValue();
46982   if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
46983                                      false, false))
46984     return SDValue();
46985 
46986   for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
46987     // TODO - add UNDEF elts support.
46988     if (UndefElts0[i] || UndefElts1[i])
46989       return SDValue();
46990     if (EltBits0[i] != ~EltBits1[i])
46991       return SDValue();
46992   }
46993 
46994   SDLoc DL(N);
46995 
46996   if (useVPTERNLOG(Subtarget, VT)) {
    // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
    // VPTERNLOG is only available for vXi32/vXi64 types.
46999     MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
47000     MVT OpVT =
47001         MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
47002     SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
47003     SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
47004     SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
47005     SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
47006     SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
47007                                 DAG, Subtarget);
47008     return DAG.getBitcast(VT, Res);
47009   }
47010 
47011   SDValue X = N->getOperand(0);
47012   SDValue Y =
47013       DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
47014                   DAG.getBitcast(VT, N1.getOperand(0)));
47015   return DAG.getNode(ISD::OR, DL, VT, X, Y);
47016 }
47017 
47018 // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
47019 static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
47020   if (N->getOpcode() != ISD::OR)
47021     return false;
47022 
47023   SDValue N0 = N->getOperand(0);
47024   SDValue N1 = N->getOperand(1);
47025 
47026   // Canonicalize AND to LHS.
47027   if (N1.getOpcode() == ISD::AND)
47028     std::swap(N0, N1);
47029 
47030   // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
47031   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
47032     return false;
47033 
47034   Mask = N1.getOperand(0);
47035   X = N1.getOperand(1);
47036 
47037   // Check to see if the mask appeared in both the AND and ANDNP.
47038   if (N0.getOperand(0) == Mask)
47039     Y = N0.getOperand(1);
47040   else if (N0.getOperand(1) == Mask)
47041     Y = N0.getOperand(0);
47042   else
47043     return false;
47044 
  // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
  // ANDNP combine allows other combines to happen that prevent matching.
47047   return true;
47048 }
47049 
47050 // Try to fold:
47051 //   (or (and (m, y), (pandn m, x)))
47052 // into:
47053 //   (vselect m, x, y)
47054 // As a special case, try to fold:
47055 //   (or (and (m, (sub 0, x)), (pandn m, x)))
47056 // into:
47057 //   (sub (xor X, M), M)
47058 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
47059                                             const X86Subtarget &Subtarget) {
47060   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
47061 
47062   EVT VT = N->getValueType(0);
47063   if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
47064         (VT.is256BitVector() && Subtarget.hasInt256())))
47065     return SDValue();
47066 
47067   SDValue X, Y, Mask;
47068   if (!matchLogicBlend(N, X, Y, Mask))
47069     return SDValue();
47070 
47071   // Validate that X, Y, and Mask are bitcasts, and see through them.
47072   Mask = peekThroughBitcasts(Mask);
47073   X = peekThroughBitcasts(X);
47074   Y = peekThroughBitcasts(Y);
47075 
47076   EVT MaskVT = Mask.getValueType();
47077   unsigned EltBits = MaskVT.getScalarSizeInBits();
47078 
47079   // TODO: Attempt to handle floating point cases as well?
47080   if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
47081     return SDValue();
47082 
47083   SDLoc DL(N);
47084 
47085   // Attempt to combine to conditional negate: (sub (xor X, M), M)
47086   if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
47087                                                            DAG, Subtarget))
47088     return Res;
47089 
47090   // PBLENDVB is only available on SSE 4.1.
47091   if (!Subtarget.hasSSE41())
47092     return SDValue();
47093 
47094   // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
47095   if (Subtarget.hasVLX())
47096     return SDValue();
47097 
47098   MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
47099 
47100   X = DAG.getBitcast(BlendVT, X);
47101   Y = DAG.getBitcast(BlendVT, Y);
47102   Mask = DAG.getBitcast(BlendVT, Mask);
47103   Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
47104   return DAG.getBitcast(VT, Mask);
47105 }
47106 
47107 // Helper function for combineOrCmpEqZeroToCtlzSrl
47108 // Transforms:
47109 //   seteq(cmp x, 0)
47110 //   into:
47111 //   srl(ctlz x), log2(bitsize(x))
47112 // Input pattern is checked by caller.
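// For example (illustrative), for an i32 x: ctlz(x) == 32 only when x == 0,
// so bit 5 of ctlz(x) is exactly the "x == 0" flag:
//   seteq(cmp x, 0) --> srl(ctlz x), 5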
47113 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
47114   SDValue Cmp = Op.getOperand(1);
47115   EVT VT = Cmp.getOperand(0).getValueType();
47116   unsigned Log2b = Log2_32(VT.getSizeInBits());
47117   SDLoc dl(Op);
47118   SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
47119   // The result of the shift is true or false, and on X86, the 32-bit
47120   // encoding of shr and lzcnt is more desirable.
47121   SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
47122   SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
47123                             DAG.getConstant(Log2b, dl, MVT::i8));
47124   return Scc;
47125 }
47126 
47127 // Try to transform:
47128 //   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
47129 //   into:
//   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
// Will also attempt to match more generic cases, e.g.:
47132 //   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
47133 // Only applies if the target supports the FastLZCNT feature.
47134 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
47135                                            TargetLowering::DAGCombinerInfo &DCI,
47136                                            const X86Subtarget &Subtarget) {
47137   if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
47138     return SDValue();
47139 
47140   auto isORCandidate = [](SDValue N) {
47141     return (N->getOpcode() == ISD::OR && N->hasOneUse());
47142   };
47143 
  // Check that the zero extend is extending to 32 bits or more. The code
  // generated by srl(ctlz) for 16-bit-or-less variants of the pattern would
  // require extra instructions to clear the upper bits.
47147   if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
47148       !isORCandidate(N->getOperand(0)))
47149     return SDValue();
47150 
47151   // Check the node matches: setcc(eq, cmp 0)
47152   auto isSetCCCandidate = [](SDValue N) {
47153     return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
47154            X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
47155            N->getOperand(1).getOpcode() == X86ISD::CMP &&
47156            isNullConstant(N->getOperand(1).getOperand(1)) &&
47157            N->getOperand(1).getValueType().bitsGE(MVT::i32);
47158   };
47159 
47160   SDNode *OR = N->getOperand(0).getNode();
47161   SDValue LHS = OR->getOperand(0);
47162   SDValue RHS = OR->getOperand(1);
47163 
47164   // Save nodes matching or(or, setcc(eq, cmp 0)).
47165   SmallVector<SDNode *, 2> ORNodes;
47166   while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
47167           (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
47168     ORNodes.push_back(OR);
47169     OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
47170     LHS = OR->getOperand(0);
47171     RHS = OR->getOperand(1);
47172   }
47173 
47174   // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
47175   if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
47176       !isORCandidate(SDValue(OR, 0)))
47177     return SDValue();
47178 
  // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
  // to or(srl(ctlz), srl(ctlz)).
  // The DAG combiner can then fold it into:
  //   srl(or(ctlz, ctlz)).
47184   SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
47185   SDValue Ret, NewRHS;
47186   if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
47187     Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
47188 
47189   if (!Ret)
47190     return SDValue();
47191 
47192   // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
  while (!ORNodes.empty()) {
47194     OR = ORNodes.pop_back_val();
47195     LHS = OR->getOperand(0);
47196     RHS = OR->getOperand(1);
47197     // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
47198     if (RHS->getOpcode() == ISD::OR)
47199       std::swap(LHS, RHS);
47200     NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
47201     if (!NewRHS)
47202       return SDValue();
47203     Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
47204   }
47205 
47206   return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
47207 }
47208 
47209 static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
47210                                    SDValue And1_L, SDValue And1_R, SDLoc DL,
47211                                    SelectionDAG &DAG) {
47212   if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
47213     return SDValue();
47214   SDValue NotOp = And0_L->getOperand(0);
47215   if (NotOp == And1_R)
47216     std::swap(And1_R, And1_L);
47217   if (NotOp != And1_L)
47218     return SDValue();
47219 
47220   // (~(NotOp) & And0_R) | (NotOp & And1_R)
47221   // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
47222   EVT VT = And1_L->getValueType(0);
47223   SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
47224   SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
47225   SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
47226   SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
47227   return Xor1;
47228 }
47229 
/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
/// equivalent `((x ^ y) & m) ^ y` pattern.
/// This is typically a better representation for targets without a fused
/// "and-not" operation. This function is intended to be called from a
/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
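/// For example (illustrative), with m == 0xFF00:
///   (x & 0xFF00) | (y & ~0xFF00) --> ((x ^ y) & 0xFF00) ^ y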
47235 static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
47236   // Note that masked-merge variants using XOR or ADD expressions are
47237   // normalized to OR by InstCombine so we only check for OR.
47238   assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
47239   SDValue N0 = Node->getOperand(0);
47240   if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
47241     return SDValue();
47242   SDValue N1 = Node->getOperand(1);
47243   if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
47244     return SDValue();
47245 
47246   SDLoc DL(Node);
47247   SDValue N00 = N0->getOperand(0);
47248   SDValue N01 = N0->getOperand(1);
47249   SDValue N10 = N1->getOperand(0);
47250   SDValue N11 = N1->getOperand(1);
47251   if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
47252     return Result;
47253   if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
47254     return Result;
47255   if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
47256     return Result;
47257   if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
47258     return Result;
47259   return SDValue();
47260 }
47261 
47262 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
47263                          TargetLowering::DAGCombinerInfo &DCI,
47264                          const X86Subtarget &Subtarget) {
47265   SDValue N0 = N->getOperand(0);
47266   SDValue N1 = N->getOperand(1);
47267   EVT VT = N->getValueType(0);
47268   SDLoc dl(N);
47269   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47270 
47271   // If this is SSE1 only convert to FOR to avoid scalarization.
47272   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
47273     return DAG.getBitcast(MVT::v4i32,
47274                           DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
47275                                       DAG.getBitcast(MVT::v4f32, N0),
47276                                       DAG.getBitcast(MVT::v4f32, N1)));
47277   }
47278 
47279   // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
47280   // TODO: Support multiple SrcOps.
47281   if (VT == MVT::i1) {
47282     SmallVector<SDValue, 2> SrcOps;
47283     SmallVector<APInt, 2> SrcPartials;
47284     if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
47285         SrcOps.size() == 1) {
47286       unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
47287       EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
47288       SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
47289       if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
47290         Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
47291       if (Mask) {
47292         assert(SrcPartials[0].getBitWidth() == NumElts &&
47293                "Unexpected partial reduction mask");
47294         SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
47295         SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
47296         Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
47297         return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
47298       }
47299     }
47300   }
47301 
47302   if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
47303     return R;
47304 
47305   if (SDValue R = combineBitOpWithShift(N, DAG))
47306     return R;
47307 
47308   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
47309     return FPLogic;
47310 
47311   if (DCI.isBeforeLegalizeOps())
47312     return SDValue();
47313 
47314   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
47315     return R;
47316 
47317   if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
47318     return R;
47319 
47320   if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
47321     return R;
47322 
47323   // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
47324   // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
47325   // iff the upper elements of the non-shifted arg are zero.
  // KUNPCK requires 16+ bool vector elements.
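  // For example (illustrative), for v32i1 with the upper 16 elements of X
  // known zero:
  //   (or X, (kshiftl Y, 16))
  //     --> (concat_vectors (extract_subvector X, 0), (extract_subvector Y, 0))
  // which should select to KUNPCKWD.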
47327   if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
47328     unsigned NumElts = VT.getVectorNumElements();
47329     unsigned HalfElts = NumElts / 2;
47330     APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
47331     if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
47332         N1.getConstantOperandAPInt(1) == HalfElts &&
47333         DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
47334       return DAG.getNode(
47335           ISD::CONCAT_VECTORS, dl, VT,
47336           extractSubVector(N0, 0, DAG, dl, HalfElts),
47337           extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
47338     }
47339     if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
47340         N0.getConstantOperandAPInt(1) == HalfElts &&
47341         DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
47342       return DAG.getNode(
47343           ISD::CONCAT_VECTORS, dl, VT,
47344           extractSubVector(N1, 0, DAG, dl, HalfElts),
47345           extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
47346     }
47347   }
47348 
47349   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
47350     // Attempt to recursively combine an OR of shuffles.
47351     SDValue Op(N, 0);
47352     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47353       return Res;
47354 
    // If either operand is a constant mask, then only the elements that aren't
    // all-ones are actually demanded by the other operand.
47357     auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
47358       APInt UndefElts;
47359       SmallVector<APInt> EltBits;
47360       int NumElts = VT.getVectorNumElements();
47361       int EltSizeInBits = VT.getScalarSizeInBits();
47362       if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
47363         return false;
47364 
47365       APInt DemandedElts = APInt::getZero(NumElts);
47366       for (int I = 0; I != NumElts; ++I)
47367         if (!EltBits[I].isAllOnes())
47368           DemandedElts.setBit(I);
47369 
47370       APInt KnownUndef, KnownZero;
47371       return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef,
47372                                             KnownZero, DCI);
47373     };
47374     if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
47375       if (N->getOpcode() != ISD::DELETED_NODE)
47376         DCI.AddToWorklist(N);
47377       return SDValue(N, 0);
47378     }
47379   }
47380 
47381   // We should fold "masked merge" patterns when `andn` is not available.
47382   if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
47383     if (SDValue R = foldMaskedMerge(N, DAG))
47384       return R;
47385 
47386   return SDValue();
47387 }
47388 
47389 /// Try to turn tests against the signbit in the form of:
47390 ///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
47391 /// into:
47392 ///   SETGT(X, -1)
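/// For example (illustrative), for an i32 X:
///   (xor (trunc (srl X, 31)), 1) --> (setgt X, -1)
/// i.e. both compute "X is non-negative".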
47393 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
47394   // This is only worth doing if the output type is i8 or i1.
47395   EVT ResultType = N->getValueType(0);
47396   if (ResultType != MVT::i8 && ResultType != MVT::i1)
47397     return SDValue();
47398 
47399   SDValue N0 = N->getOperand(0);
47400   SDValue N1 = N->getOperand(1);
47401 
47402   // We should be performing an xor against a truncated shift.
47403   if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
47404     return SDValue();
47405 
47406   // Make sure we are performing an xor against one.
47407   if (!isOneConstant(N1))
47408     return SDValue();
47409 
47410   // SetCC on x86 zero extends so only act on this if it's a logical shift.
47411   SDValue Shift = N0.getOperand(0);
47412   if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
47413     return SDValue();
47414 
47415   // Make sure we are truncating from one of i16, i32 or i64.
47416   EVT ShiftTy = Shift.getValueType();
47417   if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
47418     return SDValue();
47419 
47420   // Make sure the shift amount extracts the sign bit.
47421   if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
47422       Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
47423     return SDValue();
47424 
  // Create a greater-than comparison against -1.
  // N.B. Using SETGE against 0 works but we want a canonical-looking
  // comparison, and using SETGT matches up with what TranslateX86CC does.
47428   SDLoc DL(N);
47429   SDValue ShiftOp = Shift.getOperand(0);
47430   EVT ShiftOpTy = ShiftOp.getValueType();
47431   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47432   EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
47433                                                *DAG.getContext(), ResultType);
47434   SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
47435                               DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
47436   if (SetCCResultType != ResultType)
47437     Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
47438   return Cond;
47439 }
47440 
47441 /// Turn vector tests of the signbit in the form of:
47442 ///   xor (sra X, elt_size(X)-1), -1
47443 /// into:
47444 ///   pcmpgt X, -1
47445 ///
47446 /// This should be called before type legalization because the pattern may not
47447 /// persist after that.
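/// For example (illustrative), for v4i32:
///   (xor (sra X, 31), -1) --> (pcmpgt X, -1)
/// Each lane becomes all-ones iff the corresponding lane of X is non-negative.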
47448 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
47449                                          const X86Subtarget &Subtarget) {
47450   EVT VT = N->getValueType(0);
47451   if (!VT.isSimple())
47452     return SDValue();
47453 
47454   switch (VT.getSimpleVT().SimpleTy) {
47455   default: return SDValue();
47456   case MVT::v16i8:
47457   case MVT::v8i16:
47458   case MVT::v4i32:
47459   case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
47460   case MVT::v32i8:
47461   case MVT::v16i16:
47462   case MVT::v8i32:
47463   case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
47464   }
47465 
47466   // There must be a shift right algebraic before the xor, and the xor must be a
47467   // 'not' operation.
47468   SDValue Shift = N->getOperand(0);
47469   SDValue Ones = N->getOperand(1);
47470   if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
47471       !ISD::isBuildVectorAllOnes(Ones.getNode()))
47472     return SDValue();
47473 
47474   // The shift should be smearing the sign bit across each vector element.
47475   auto *ShiftAmt =
47476       isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
47477   if (!ShiftAmt ||
47478       ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
47479     return SDValue();
47480 
47481   // Create a greater-than comparison against -1. We don't use the more obvious
47482   // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
47483   return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
47484 }
47485 
47486 /// Detect patterns of truncation with unsigned saturation:
47487 ///
47488 /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
47489 ///   Return the source value x to be truncated or SDValue() if the pattern was
47490 ///   not matched.
47491 ///
47492 /// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
47493 ///   where C1 >= 0 and C2 is unsigned max of destination type.
47494 ///
47495 ///    (truncate (smax (smin (x, C2), C1)) to dest_type)
47496 ///   where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
47497 ///
47498 ///   These two patterns are equivalent to:
47499 ///   (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
47500 ///   So return the smax(x, C1) value to be truncated or SDValue() if the
47501 ///   pattern was not matched.
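/// For example (illustrative), when truncating i32 elements to i8:
///   (truncate (umin (x, 255)) to i8) matches pattern 1 and returns x.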
47502 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
47503                                  const SDLoc &DL) {
47504   EVT InVT = In.getValueType();
47505 
47506   // Saturation with truncation. We truncate from InVT to VT.
47507   assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
47508          "Unexpected types for truncate operation");
47509 
47510   // Match min/max and return limit value as a parameter.
47511   auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
47512     if (V.getOpcode() == Opcode &&
47513         ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
47514       return V.getOperand(0);
47515     return SDValue();
47516   };
47517 
47518   APInt C1, C2;
47519   if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
    // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
    // the element size of the destination type.
47522     if (C2.isMask(VT.getScalarSizeInBits()))
47523       return UMin;
47524 
47525   if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
47526     if (MatchMinMax(SMin, ISD::SMAX, C1))
47527       if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
47528         return SMin;
47529 
47530   if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
47531     if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
47532       if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
47533           C2.uge(C1)) {
47534         return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
47535       }
47536 
47537   return SDValue();
47538 }
47539 
47540 /// Detect patterns of truncation with signed saturation:
47541 /// (truncate (smin ((smax (x, signed_min_of_dest_type)),
47542 ///                  signed_max_of_dest_type)) to dest_type)
47543 /// or:
47544 /// (truncate (smax ((smin (x, signed_max_of_dest_type)),
47545 ///                  signed_min_of_dest_type)) to dest_type).
47546 /// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
47547 /// Return the source value to be truncated or SDValue() if the pattern was not
47548 /// matched.
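/// For example (illustrative), when truncating i32 elements to i8:
///   (truncate (smin (smax (x, -128), 127)) to i8) matches and returns x.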
47549 static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
47550   unsigned NumDstBits = VT.getScalarSizeInBits();
47551   unsigned NumSrcBits = In.getScalarValueSizeInBits();
47552   assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
47553 
47554   auto MatchMinMax = [](SDValue V, unsigned Opcode,
47555                         const APInt &Limit) -> SDValue {
47556     APInt C;
47557     if (V.getOpcode() == Opcode &&
47558         ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
47559       return V.getOperand(0);
47560     return SDValue();
47561   };
47562 
47563   APInt SignedMax, SignedMin;
47564   if (MatchPackUS) {
47565     SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
47566     SignedMin = APInt(NumSrcBits, 0);
47567   } else {
47568     SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
47569     SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
47570   }
47571 
47572   if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
47573     if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
47574       return SMax;
47575 
47576   if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
47577     if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
47578       return SMin;
47579 
47580   return SDValue();
47581 }
47582 
47583 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
47584                                       SelectionDAG &DAG,
47585                                       const X86Subtarget &Subtarget) {
47586   if (!Subtarget.hasSSE2() || !VT.isVector())
47587     return SDValue();
47588 
47589   EVT SVT = VT.getVectorElementType();
47590   EVT InVT = In.getValueType();
47591   EVT InSVT = InVT.getVectorElementType();
47592 
  // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
  // split across two registers, we can use a packusdw+perm to clamp to 0-65535
  // and concatenate at the same time. Then we can use a final vpmovuswb to
  // clip to 0-255.
47597   if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
47598       InVT == MVT::v16i32 && VT == MVT::v16i8) {
47599     if (auto USatVal = detectSSatPattern(In, VT, true)) {
47600       // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
47601       SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
47602                                            DL, DAG, Subtarget);
47603       assert(Mid && "Failed to pack!");
47604       return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
47605     }
47606   }
47607 
47608   // vXi32 truncate instructions are available with AVX512F.
47609   // vXi16 truncate instructions are only available with AVX512BW.
47610   // For 256-bit or smaller vectors, we require VLX.
47611   // FIXME: We could widen truncates to 512 to remove the VLX restriction.
  // If the result type is 256 bits or larger and we have disabled 512-bit
  // registers, we should go ahead and use the pack instructions if possible.
47614   bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
47615                        (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
47616                       (InVT.getSizeInBits() > 128) &&
47617                       (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
47618                       !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
47619 
47620   if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
47621       VT.getSizeInBits() >= 64 &&
47622       (SVT == MVT::i8 || SVT == MVT::i16) &&
47623       (InSVT == MVT::i16 || InSVT == MVT::i32)) {
47624     if (auto USatVal = detectSSatPattern(In, VT, true)) {
47625       // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
      // Only do this when the result is at least 64 bits or we'll be leaving
      // dangling PACKSSDW nodes.
47628       if (SVT == MVT::i8 && InSVT == MVT::i32) {
47629         EVT MidVT = VT.changeVectorElementType(MVT::i16);
47630         SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
47631                                              DAG, Subtarget);
47632         assert(Mid && "Failed to pack!");
47633         SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
47634                                            Subtarget);
47635         assert(V && "Failed to pack!");
47636         return V;
47637       } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
47638         return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
47639                                       Subtarget);
47640     }
47641     if (auto SSatVal = detectSSatPattern(In, VT))
47642       return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
47643                                     Subtarget);
47644   }
47645 
47646   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47647   if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
47648       Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
47649       (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
47650     unsigned TruncOpc = 0;
47651     SDValue SatVal;
47652     if (auto SSatVal = detectSSatPattern(In, VT)) {
47653       SatVal = SSatVal;
47654       TruncOpc = X86ISD::VTRUNCS;
47655     } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
47656       SatVal = USatVal;
47657       TruncOpc = X86ISD::VTRUNCUS;
47658     }
47659     if (SatVal) {
47660       unsigned ResElts = VT.getVectorNumElements();
47661       // If the input type is less than 512 bits and we don't have VLX, we need
47662       // to widen to 512 bits.
47663       if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
47664         unsigned NumConcats = 512 / InVT.getSizeInBits();
47665         ResElts *= NumConcats;
47666         SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
47667         ConcatOps[0] = SatVal;
47668         InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
47669                                 NumConcats * InVT.getVectorNumElements());
47670         SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
47671       }
      // Widen the result if it's narrower than 128 bits.
47673       if (ResElts * SVT.getSizeInBits() < 128)
47674         ResElts = 128 / SVT.getSizeInBits();
47675       EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
47676       SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
47677       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
47678                          DAG.getIntPtrConstant(0, DL));
47679     }
47680   }
47681 
47682   return SDValue();
47683 }
47684 
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the
/// efficient X86ISD::AVG instruction.
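/// For example (illustrative), v16i8 inputs should lower to a single PAVGB and
/// v8i16 inputs to a single PAVGW.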
47688 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
47689                                 const X86Subtarget &Subtarget,
47690                                 const SDLoc &DL) {
47691   if (!VT.isVector())
47692     return SDValue();
47693   EVT InVT = In.getValueType();
47694   unsigned NumElems = VT.getVectorNumElements();
47695 
47696   EVT ScalarVT = VT.getVectorElementType();
47697   if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
47698     return SDValue();
47699 
  // InScalarVT is the intermediate type in the AVG pattern and it should be
  // wider than the original input type (i8/i16).
47702   EVT InScalarVT = InVT.getVectorElementType();
47703   if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
47704     return SDValue();
47705 
47706   if (!Subtarget.hasSSE2())
47707     return SDValue();
47708 
47709   // Detect the following pattern:
47710   //
47711   //   %1 = zext <N x i8> %a to <N x i32>
47712   //   %2 = zext <N x i8> %b to <N x i32>
47713   //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
47714   //   %4 = add nuw nsw <N x i32> %3, %2
  //   %5 = lshr <N x i32> %4, <i32 1 x N>
47716   //   %6 = trunc <N x i32> %5 to <N x i8>
47717   //
47718   // In AVX512, the last instruction can also be a trunc store.
47719   if (In.getOpcode() != ISD::SRL)
47720     return SDValue();
47721 
  // A lambda checking that the given SDValue is a constant vector and each
  // element is in the range [Min, Max].
47724   auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
47725     return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
47726       return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
47727     });
47728   };
47729 
47730   auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
47731     unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
47732     return MaxActiveBits <= ScalarVT.getSizeInBits();
47733   };
47734 
47735   // Check if each element of the vector is right-shifted by one.
47736   SDValue LHS = In.getOperand(0);
47737   SDValue RHS = In.getOperand(1);
47738   if (!IsConstVectorInRange(RHS, 1, 1))
47739     return SDValue();
47740   if (LHS.getOpcode() != ISD::ADD)
47741     return SDValue();
47742 
47743   // Detect a pattern of a + b + 1 where the order doesn't matter.
47744   SDValue Operands[3];
47745   Operands[0] = LHS.getOperand(0);
47746   Operands[1] = LHS.getOperand(1);
47747 
47748   auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47749                        ArrayRef<SDValue> Ops) {
47750     return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
47751   };
47752 
47753   auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
47754     for (SDValue &Op : Ops)
47755       if (Op.getValueType() != VT)
47756         Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
47757     // Pad to a power-of-2 vector, split+apply and extract the original vector.
47758     unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
47759     EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
47760     if (NumElemsPow2 != NumElems) {
47761       for (SDValue &Op : Ops) {
47762         SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
47763         for (unsigned i = 0; i != NumElems; ++i) {
47764           SDValue Idx = DAG.getIntPtrConstant(i, DL);
47765           EltsOfOp[i] =
47766               DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
47767         }
47768         Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
47769       }
47770     }
47771     SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
47772     if (NumElemsPow2 == NumElems)
47773       return Res;
47774     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
47775                        DAG.getIntPtrConstant(0, DL));
47776   };
47777 
  // Take care of the case when one of the operands is a constant vector whose
  // elements are in the range [1, 256] (i8) or [1, 65536] (i16).
47780   if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
47781       IsZExtLike(Operands[0])) {
47782     // The pattern is detected. Subtract one from the constant vector, then
47783     // demote it and emit X86ISD::AVG instruction.
47784     SDValue VecOnes = DAG.getConstant(1, DL, InVT);
47785     Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
47786     return AVGSplitter({Operands[0], Operands[1]});
47787   }
47788 
  // Matches 'add like' patterns: add(Op0,Op1) or zext(or(Op0,Op1)).
  // Match the 'or' case only if it's 'add-like', i.e. it can be replaced by
  // an add.
47791   auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
47792     if (ISD::ADD == V.getOpcode()) {
47793       Op0 = V.getOperand(0);
47794       Op1 = V.getOperand(1);
47795       return true;
47796     }
47797     if (ISD::ZERO_EXTEND != V.getOpcode())
47798       return false;
47799     V = V.getOperand(0);
47800     if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
47801         !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
47802       return false;
47803     Op0 = V.getOperand(0);
47804     Op1 = V.getOperand(1);
47805     return true;
47806   };
47807 
47808   SDValue Op0, Op1;
47809   if (FindAddLike(Operands[0], Op0, Op1))
47810     std::swap(Operands[0], Operands[1]);
47811   else if (!FindAddLike(Operands[1], Op0, Op1))
47812     return SDValue();
47813   Operands[2] = Op0;
47814   Operands[1] = Op1;
47815 
47816   // Now we have three operands of two additions. Check that one of them is a
47817   // constant vector with ones, and the other two can be promoted from i8/i16.
47818   for (int i = 0; i < 3; ++i) {
47819     if (!IsConstVectorInRange(Operands[i], 1, 1))
47820       continue;
47821     std::swap(Operands[i], Operands[2]);
47822 
47823     // Check if Operands[0] and Operands[1] are results of type promotion.
47824     for (int j = 0; j < 2; ++j)
47825       if (Operands[j].getValueType() != VT)
47826         if (!IsZExtLike(Operands[j]))
47827           return SDValue();
47828 
47829     // The pattern is detected; emit the X86ISD::AVG instruction(s).
47830     return AVGSplitter({Operands[0], Operands[1]});
47831   }
47832 
47833   return SDValue();
47834 }
47835 
47836 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
47837                            TargetLowering::DAGCombinerInfo &DCI,
47838                            const X86Subtarget &Subtarget) {
47839   LoadSDNode *Ld = cast<LoadSDNode>(N);
47840   EVT RegVT = Ld->getValueType(0);
47841   EVT MemVT = Ld->getMemoryVT();
47842   SDLoc dl(Ld);
47843   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47844 
47845   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
47846   // into two 16-byte operations. Also split non-temporal aligned loads on
47847   // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
47848   ISD::LoadExtType Ext = Ld->getExtensionType();
47849   bool Fast;
47850   if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
47851       Ext == ISD::NON_EXTLOAD &&
47852       ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
47853         Ld->getAlignment() >= 16) ||
47854        (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
47855                                *Ld->getMemOperand(), &Fast) &&
47856         !Fast))) {
47857     unsigned NumElems = RegVT.getVectorNumElements();
47858     if (NumElems < 2)
47859       return SDValue();
47860 
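          // Split into two 16-byte loads at offsets 0 and 16 from the base pointer,
          // glue their chains with a TokenFactor and re-concatenate the two halves.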
47861     unsigned HalfOffset = 16;
47862     SDValue Ptr1 = Ld->getBasePtr();
47863     SDValue Ptr2 =
47864         DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
47865     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
47866                                   NumElems / 2);
47867     SDValue Load1 =
47868         DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
47869                     Ld->getOriginalAlign(),
47870                     Ld->getMemOperand()->getFlags());
47871     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
47872                                 Ld->getPointerInfo().getWithOffset(HalfOffset),
47873                                 Ld->getOriginalAlign(),
47874                                 Ld->getMemOperand()->getFlags());
47875     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
47876                              Load1.getValue(1), Load2.getValue(1));
47877 
47878     SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
47879     return DCI.CombineTo(N, NewVec, TF, true);
47880   }
47881 
47882   // Bool vector load - attempt to cast to an integer, as we have good
47883   // (vXiY *ext(vXi1 bitcast(iX))) handling.
47884   if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
47885       RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
47886     unsigned NumElts = RegVT.getVectorNumElements();
47887     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
47888     if (TLI.isTypeLegal(IntVT)) {
47889       SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
47890                                     Ld->getPointerInfo(),
47891                                     Ld->getOriginalAlign(),
47892                                     Ld->getMemOperand()->getFlags());
47893       SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
47894       return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
47895     }
47896   }
47897 
47898   // If this load is also broadcast as a subvector to a wider type, just
47899   // extract the lowest subvector of that broadcast instead.
47900   if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
47901       (RegVT.is128BitVector() || RegVT.is256BitVector())) {
47902     SDValue Ptr = Ld->getBasePtr();
47903     SDValue Chain = Ld->getChain();
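          // Look for another user of the same pointer and chain that performs a
          // wider SUBV_BROADCAST_LOAD of the same memory; if found, reuse its low
          // subvector instead of issuing this load.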
47904     for (SDNode *User : Ptr->uses()) {
47905       if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
47906           cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
47907           cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
47908           cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
47909               MemVT.getSizeInBits() &&
47910           !User->hasAnyUseOfValue(1) &&
47911           User->getValueSizeInBits(0).getFixedSize() >
47912               RegVT.getFixedSizeInBits()) {
47913         SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
47914                                            RegVT.getSizeInBits());
47915         Extract = DAG.getBitcast(RegVT, Extract);
47916         return DCI.CombineTo(N, Extract, SDValue(User, 1));
47917       }
47918     }
47919   }
47920 
47921   // Cast ptr32 and ptr64 pointers to the default address space before a load.
47922   unsigned AddrSpace = Ld->getAddressSpace();
47923   if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
47924       AddrSpace == X86AS::PTR32_UPTR) {
47925     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
47926     if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
47927       SDValue Cast =
47928           DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
47929       return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
47930                          Ld->getOriginalAlign(),
47931                          Ld->getMemOperand()->getFlags());
47932     }
47933   }
47934 
47935   return SDValue();
47936 }
47937 
47938 /// If V is a build vector of boolean constants and exactly one of those
47939 /// constants is true, return the operand index of that true element.
47940 /// Otherwise, return -1.
47941 static int getOneTrueElt(SDValue V) {
47942   // This needs to be a build vector of booleans.
47943   // TODO: Checking for the i1 type matches the IR definition for the mask,
47944   // but the mask check could be loosened to i8 or other types. That might
47945   // also require checking more than 'allOnesValue'; e.g., the x86 HW
47946   // instructions only require that the MSB is set for each mask element.
47947   // The ISD::MSTORE comments/definition do not specify how the mask operand
47948   // is formatted.
47949   auto *BV = dyn_cast<BuildVectorSDNode>(V);
47950   if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
47951     return -1;
47952 
47953   int TrueIndex = -1;
47954   unsigned NumElts = BV->getValueType(0).getVectorNumElements();
47955   for (unsigned i = 0; i < NumElts; ++i) {
47956     const SDValue &Op = BV->getOperand(i);
47957     if (Op.isUndef())
47958       continue;
47959     auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
47960     if (!ConstNode)
47961       return -1;
47962     if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
47963       // If we already found a one, this is too many.
47964       if (TrueIndex >= 0)
47965         return -1;
47966       TrueIndex = i;
47967     }
47968   }
47969   return TrueIndex;
47970 }
47971 
47972 /// Given a masked memory load/store operation, return true if it has one mask
47973 /// bit set. If it has one mask bit set, then also return the memory address of
47974 /// the scalar element to load/store, the vector index to insert/extract that
47975 /// scalar element, and the alignment for the scalar memory access.
47976 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
47977                                          SelectionDAG &DAG, SDValue &Addr,
47978                                          SDValue &Index, Align &Alignment,
47979                                          unsigned &Offset) {
47980   int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
47981   if (TrueMaskElt < 0)
47982     return false;
47983 
47984   // Get the address of the one scalar element that is specified by the mask
47985   // using the appropriate offset from the base pointer.
47986   EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
47987   Offset = 0;
47988   Addr = MaskedOp->getBasePtr();
47989   if (TrueMaskElt != 0) {
47990     Offset = TrueMaskElt * EltVT.getStoreSize();
47991     Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
47992                                     SDLoc(MaskedOp));
47993   }
47994 
47995   Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
47996   Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
47997                               EltVT.getStoreSize());
47998   return true;
47999 }
48000 
48001 /// If exactly one element of the mask is set for a non-extending masked load,
48002 /// it is a scalar load and vector insert.
48003 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
48004 /// mask have already been optimized in IR, so we don't bother with those here.
48005 static SDValue
48006 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
48007                              TargetLowering::DAGCombinerInfo &DCI,
48008                              const X86Subtarget &Subtarget) {
48009   assert(ML->isUnindexed() && "Unexpected indexed masked load!");
48010   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
48011   // However, some target hooks may need to be added to know when the transform
48012   // is profitable. Endianness would also have to be considered.
48013 
48014   SDValue Addr, VecIndex;
48015   Align Alignment;
48016   unsigned Offset;
48017   if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
48018     return SDValue();
48019 
48020   // Load the one scalar element that is specified by the mask using the
48021   // appropriate offset from the base pointer.
48022   SDLoc DL(ML);
48023   EVT VT = ML->getValueType(0);
48024   EVT EltVT = VT.getVectorElementType();
48025 
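        // i64 is not a legal scalar type on 32-bit targets, so load the element as
        // f64 instead and adjust the vector type to match; the loaded bits are the
        // same either way.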
48026   EVT CastVT = VT;
48027   if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
48028     EltVT = MVT::f64;
48029     CastVT = VT.changeVectorElementType(EltVT);
48030   }
48031 
48032   SDValue Load =
48033       DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
48034                   ML->getPointerInfo().getWithOffset(Offset),
48035                   Alignment, ML->getMemOperand()->getFlags());
48036 
48037   SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
48038 
48039   // Insert the loaded element into the appropriate place in the vector.
48040   SDValue Insert =
48041       DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
48042   Insert = DAG.getBitcast(VT, Insert);
48043   return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
48044 }
48045 
48046 static SDValue
48047 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
48048                               TargetLowering::DAGCombinerInfo &DCI) {
48049   assert(ML->isUnindexed() && "Unexpected indexed masked load!");
48050   if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
48051     return SDValue();
48052 
48053   SDLoc DL(ML);
48054   EVT VT = ML->getValueType(0);
48055 
48056   // If we are loading the first and last elements of a vector, it is safe and
48057   // always faster to load the whole vector. Replace the masked load with a
48058   // vector load and select.
48059   unsigned NumElts = VT.getVectorNumElements();
48060   BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
48061   bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
48062   bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
48063   if (LoadFirstElt && LoadLastElt) {
48064     SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
48065                                 ML->getMemOperand());
48066     SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
48067                                   ML->getPassThru());
48068     return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
48069   }
48070 
48071   // Convert a masked load with a constant mask into a masked load and a select.
48072   // This allows the select operation to use a faster kind of select instruction
48073   // (for example, vblendvps -> vblendps).
48074 
48075   // Don't try this if the pass-through operand is already undefined. That would
48076   // cause an infinite loop because that's what we're about to create.
48077   if (ML->getPassThru().isUndef())
48078     return SDValue();
48079 
48080   if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
48081     return SDValue();
48082 
48083   // The new masked load has an undef pass-through operand. The select uses the
48084   // original pass-through operand.
48085   SDValue NewML = DAG.getMaskedLoad(
48086       VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
48087       DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
48088       ML->getAddressingMode(), ML->getExtensionType());
48089   SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
48090                                 ML->getPassThru());
48091 
48092   return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
48093 }
48094 
48095 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
48096                                  TargetLowering::DAGCombinerInfo &DCI,
48097                                  const X86Subtarget &Subtarget) {
48098   auto *Mld = cast<MaskedLoadSDNode>(N);
48099 
48100   // TODO: Expanding load with constant mask may be optimized as well.
48101   if (Mld->isExpandingLoad())
48102     return SDValue();
48103 
48104   if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
48105     if (SDValue ScalarLoad =
48106             reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
48107       return ScalarLoad;
48108 
48109     // TODO: Do some AVX512 subsets benefit from this transform?
48110     if (!Subtarget.hasAVX512())
48111       if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
48112         return Blend;
48113   }
48114 
48115   // If the mask value has been legalized to a non-boolean vector, try to
48116   // simplify ops leading up to it. We only demand the MSB of each lane.
48117   SDValue Mask = Mld->getMask();
48118   if (Mask.getScalarValueSizeInBits() != 1) {
48119     EVT VT = Mld->getValueType(0);
48120     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48121     APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
48122     if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
48123       if (N->getOpcode() != ISD::DELETED_NODE)
48124         DCI.AddToWorklist(N);
48125       return SDValue(N, 0);
48126     }
48127     if (SDValue NewMask =
48128             TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
48129       return DAG.getMaskedLoad(
48130           VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
48131           NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
48132           Mld->getAddressingMode(), Mld->getExtensionType());
48133   }
48134 
48135   return SDValue();
48136 }
48137 
48138 /// If exactly one element of the mask is set for a non-truncating masked store,
48139 /// it is a vector extract and scalar store.
48140 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
48141 /// mask have already been optimized in IR, so we don't bother with those here.
48142 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
48143                                               SelectionDAG &DAG,
48144                                               const X86Subtarget &Subtarget) {
48145   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
48146   // However, some target hooks may need to be added to know when the transform
48147   // is profitable. Endianness would also have to be considered.
48148 
48149   SDValue Addr, VecIndex;
48150   Align Alignment;
48151   unsigned Offset;
48152   if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
48153     return SDValue();
48154 
48155   // Extract the one scalar element that is actually being stored.
48156   SDLoc DL(MS);
48157   SDValue Value = MS->getValue();
48158   EVT VT = Value.getValueType();
48159   EVT EltVT = VT.getVectorElementType();
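        // As in the masked-load case above, use f64 instead of i64 on 32-bit
        // targets so the scalar extract and store stay legal; only the type
        // changes, not the bits.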
48160   if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
48161     EltVT = MVT::f64;
48162     EVT CastVT = VT.changeVectorElementType(EltVT);
48163     Value = DAG.getBitcast(CastVT, Value);
48164   }
48165   SDValue Extract =
48166       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
48167 
48168   // Store that element at the appropriate offset from the base pointer.
48169   return DAG.getStore(MS->getChain(), DL, Extract, Addr,
48170                       MS->getPointerInfo().getWithOffset(Offset),
48171                       Alignment, MS->getMemOperand()->getFlags());
48172 }
48173 
48174 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
48175                                   TargetLowering::DAGCombinerInfo &DCI,
48176                                   const X86Subtarget &Subtarget) {
48177   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
48178   if (Mst->isCompressingStore())
48179     return SDValue();
48180 
48181   EVT VT = Mst->getValue().getValueType();
48182   SDLoc dl(Mst);
48183   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48184 
48185   if (Mst->isTruncatingStore())
48186     return SDValue();
48187 
48188   if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
48189     return ScalarStore;
48190 
48191   // If the mask value has been legalized to a non-boolean vector, try to
48192   // simplify ops leading up to it. We only demand the MSB of each lane.
48193   SDValue Mask = Mst->getMask();
48194   if (Mask.getScalarValueSizeInBits() != 1) {
48195     APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
48196     if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
48197       if (N->getOpcode() != ISD::DELETED_NODE)
48198         DCI.AddToWorklist(N);
48199       return SDValue(N, 0);
48200     }
48201     if (SDValue NewMask =
48202             TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
48203       return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
48204                                 Mst->getBasePtr(), Mst->getOffset(), NewMask,
48205                                 Mst->getMemoryVT(), Mst->getMemOperand(),
48206                                 Mst->getAddressingMode());
48207   }
48208 
48209   SDValue Value = Mst->getValue();
48210   if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
48211       TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
48212                             Mst->getMemoryVT())) {
48213     return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
48214                               Mst->getBasePtr(), Mst->getOffset(), Mask,
48215                               Mst->getMemoryVT(), Mst->getMemOperand(),
48216                               Mst->getAddressingMode(), true);
48217   }
48218 
48219   return SDValue();
48220 }
48221 
48222 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
48223                             TargetLowering::DAGCombinerInfo &DCI,
48224                             const X86Subtarget &Subtarget) {
48225   StoreSDNode *St = cast<StoreSDNode>(N);
48226   EVT StVT = St->getMemoryVT();
48227   SDLoc dl(St);
48228   SDValue StoredVal = St->getValue();
48229   EVT VT = StoredVal.getValueType();
48230   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48231 
48232   // Convert a store of vXi1 into a store of iX and a bitcast.
48233   if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
48234       VT.getVectorElementType() == MVT::i1) {
48235 
48236     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
48237     StoredVal = DAG.getBitcast(NewVT, StoredVal);
48238 
48239     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
48240                         St->getPointerInfo(), St->getOriginalAlign(),
48241                         St->getMemOperand()->getFlags());
48242   }
48243 
48244   // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
48245   // This will avoid a copy to a k-register.
48246   if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
48247       StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
48248       StoredVal.getOperand(0).getValueType() == MVT::i8) {
48249     SDValue Val = StoredVal.getOperand(0);
48250     // We must store zeros to the unused bits.
48251     Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
48252     return DAG.getStore(St->getChain(), dl, Val,
48253                         St->getBasePtr(), St->getPointerInfo(),
48254                         St->getOriginalAlign(),
48255                         St->getMemOperand()->getFlags());
48256   }
48257 
48258   // Widen v2i1/v4i1 stores to v8i1.
48259   if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
48260       Subtarget.hasAVX512()) {
48261     unsigned NumConcats = 8 / VT.getVectorNumElements();
48262     // We must store zeros to the unused bits.
48263     SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
48264     Ops[0] = StoredVal;
48265     StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
48266     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
48267                         St->getPointerInfo(), St->getOriginalAlign(),
48268                         St->getMemOperand()->getFlags());
48269   }
48270 
48271   // Turn vXi1 stores of constants into scalar stores.
48272   if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
48273        VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
48274       ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
48275     // If it's a v64i1 store without 64-bit support, we need two stores.
48276     if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
48277       SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
48278                                       StoredVal->ops().slice(0, 32));
48279       Lo = combinevXi1ConstantToInteger(Lo, DAG);
48280       SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
48281                                       StoredVal->ops().slice(32, 32));
48282       Hi = combinevXi1ConstantToInteger(Hi, DAG);
48283 
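            // Store the low and high 32 mask bits separately, at offsets 0 and 4
            // from the base pointer.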
48284       SDValue Ptr0 = St->getBasePtr();
48285       SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
48286 
48287       SDValue Ch0 =
48288           DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
48289                        St->getOriginalAlign(),
48290                        St->getMemOperand()->getFlags());
48291       SDValue Ch1 =
48292           DAG.getStore(St->getChain(), dl, Hi, Ptr1,
48293                        St->getPointerInfo().getWithOffset(4),
48294                        St->getOriginalAlign(),
48295                        St->getMemOperand()->getFlags());
48296       return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
48297     }
48298 
48299     StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
48300     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
48301                         St->getPointerInfo(), St->getOriginalAlign(),
48302                         St->getMemOperand()->getFlags());
48303   }
48304 
48305   // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
48306   // Sandy Bridge, perform two 16-byte stores.
48307   bool Fast;
48308   if (VT.is256BitVector() && StVT == VT &&
48309       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
48310                              *St->getMemOperand(), &Fast) &&
48311       !Fast) {
48312     unsigned NumElems = VT.getVectorNumElements();
48313     if (NumElems < 2)
48314       return SDValue();
48315 
48316     return splitVectorStore(St, DAG);
48317   }
48318 
48319   // Split under-aligned vector non-temporal stores.
48320   if (St->isNonTemporal() && StVT == VT &&
48321       St->getAlignment() < VT.getStoreSize()) {
48322     // ZMM/YMM nt-stores - either they can be stored as a series of shorter
48323     // vectors or the legalizer can scalarize them to use MOVNTI.
48324     if (VT.is256BitVector() || VT.is512BitVector()) {
48325       unsigned NumElems = VT.getVectorNumElements();
48326       if (NumElems < 2)
48327         return SDValue();
48328       return splitVectorStore(St, DAG);
48329     }
48330 
48331     // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
48332     // to use MOVNTI.
48333     if (VT.is128BitVector() && Subtarget.hasSSE2()) {
48334       MVT NTVT = Subtarget.hasSSE4A()
48335                      ? MVT::v2f64
48336                      : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
48337       return scalarizeVectorStore(St, NTVT, DAG);
48338     }
48339   }
48340 
48341   // Try to optimize v16i16->v16i8 truncating stores when BWI is not
48342   // supported but AVX512F is, by extending to v16i32 and truncating.
48343   if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
48344       St->getValue().getOpcode() == ISD::TRUNCATE &&
48345       St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
48346       TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
48347       St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
48348     SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
48349                               St->getValue().getOperand(0));
48350     return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
48351                              MVT::v16i8, St->getMemOperand());
48352   }
48353 
48354   // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
48355   if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
48356       (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
48357        StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
48358       TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
48359     bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
48360     return EmitTruncSStore(IsSigned, St->getChain(),
48361                            dl, StoredVal.getOperand(0), St->getBasePtr(),
48362                            VT, St->getMemOperand(), DAG);
48363   }
48364 
48365   // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
48366   if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
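          // Peek through a one-use TRUNCATE to find an extract of element 0
          // (EXTRACT_VECTOR_ELT or PEXTRW) and return the vector being extracted.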
48367     auto IsExtractedElement = [](SDValue V) {
48368       if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
48369         V = V.getOperand(0);
48370       unsigned Opc = V.getOpcode();
48371       if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
48372         if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
48373           return V.getOperand(0);
48374       }
48375       return SDValue();
48376     };
48377     if (SDValue Extract = IsExtractedElement(StoredVal)) {
48378       SDValue Trunc = peekThroughOneUseBitcasts(Extract);
48379       if (Trunc.getOpcode() == X86ISD::VTRUNC) {
48380         SDValue Src = Trunc.getOperand(0);
48381         MVT DstVT = Trunc.getSimpleValueType();
48382         MVT SrcVT = Src.getSimpleValueType();
48383         unsigned NumSrcElts = SrcVT.getVectorNumElements();
48384         unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
48385         MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
48386         if (NumTruncBits == VT.getSizeInBits() &&
48387             TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
48388           return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
48389                                    TruncVT, St->getMemOperand());
48390         }
48391       }
48392     }
48393   }
48394 
48395   // Optimize trunc store (of multiple scalars) to shuffle and store.
48396   // First, pack all of the elements in one place. Next, store to memory
48397   // in fewer chunks.
48398   if (St->isTruncatingStore() && VT.isVector()) {
48399     // Check if we can detect an AVG pattern from the truncation. If yes,
48400     // replace the trunc store by a normal store with the result of an
48401     // X86ISD::AVG instruction.
48402     if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
48403       if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
48404                                          Subtarget, dl))
48405         return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
48406                             St->getPointerInfo(), St->getOriginalAlign(),
48407                             St->getMemOperand()->getFlags());
48408 
48409     if (TLI.isTruncStoreLegal(VT, StVT)) {
48410       if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
48411         return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
48412                                dl, Val, St->getBasePtr(),
48413                                St->getMemoryVT(), St->getMemOperand(), DAG);
48414       if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
48415                                           DAG, dl))
48416         return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
48417                                dl, Val, St->getBasePtr(),
48418                                St->getMemoryVT(), St->getMemOperand(), DAG);
48419     }
48420 
48421     return SDValue();
48422   }
48423 
48424   // Cast ptr32 and ptr64 pointers to the default address space before a store.
48425   unsigned AddrSpace = St->getAddressSpace();
48426   if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
48427       AddrSpace == X86AS::PTR32_UPTR) {
48428     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
48429     if (PtrVT != St->getBasePtr().getSimpleValueType()) {
48430       SDValue Cast =
48431           DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
48432       return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
48433                           St->getPointerInfo(), St->getOriginalAlign(),
48434                           St->getMemOperand()->getFlags(), St->getAAInfo());
48435     }
48436   }
48437 
48438   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
48439   // the FP state in cases where an EMMS may be missing.
48440   // A preferable solution to the general problem is to figure out the right
48441   // places to insert EMMS.  This qualifies as a quick hack.
48442 
48443   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
48444   if (VT.getSizeInBits() != 64)
48445     return SDValue();
48446 
48447   const Function &F = DAG.getMachineFunction().getFunction();
48448   bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
48449   bool F64IsLegal =
48450       !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
48451   if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
48452       isa<LoadSDNode>(St->getValue()) &&
48453       cast<LoadSDNode>(St->getValue())->isSimple() &&
48454       St->getChain().hasOneUse() && St->isSimple()) {
48455     LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
48456 
48457     if (!ISD::isNormalLoad(Ld))
48458       return SDValue();
48459 
48460     // Avoid the transformation if there are multiple uses of the loaded value.
48461     if (!Ld->hasNUsesOfValue(1, 0))
48462       return SDValue();
48463 
48464     SDLoc LdDL(Ld);
48465     SDLoc StDL(N);
48466     // Lower to a single movq load/store pair.
48467     SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
48468                                 Ld->getBasePtr(), Ld->getMemOperand());
48469 
48470     // Make sure new load is placed in same chain order.
48471     DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
48472     return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
48473                         St->getMemOperand());
48474   }
48475 
48476   // This is similar to the above case, but here we handle a scalar 64-bit
48477   // integer store that is extracted from a vector on a 32-bit target.
48478   // If we have SSE2, then we can treat it like a floating-point double
48479   // to get past legalization. The execution dependencies fixup pass will
48480   // choose the optimal machine instruction for the store if this really is
48481   // an integer or v2f32 rather than an f64.
48482   if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
48483       St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
48484     SDValue OldExtract = St->getOperand(1);
48485     SDValue ExtOp0 = OldExtract.getOperand(0);
48486     unsigned VecSize = ExtOp0.getValueSizeInBits();
48487     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
48488     SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
48489     SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
48490                                      BitCast, OldExtract.getOperand(1));
48491     return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
48492                         St->getPointerInfo(), St->getOriginalAlign(),
48493                         St->getMemOperand()->getFlags());
48494   }
48495 
48496   return SDValue();
48497 }
48498 
48499 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
48500                                      TargetLowering::DAGCombinerInfo &DCI,
48501                                      const X86Subtarget &Subtarget) {
48502   auto *St = cast<MemIntrinsicSDNode>(N);
48503 
48504   SDValue StoredVal = N->getOperand(1);
48505   MVT VT = StoredVal.getSimpleValueType();
48506   EVT MemVT = St->getMemoryVT();
48507 
48508   // Figure out which elements we demand.
48509   unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
48510   APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
48511 
48512   APInt KnownUndef, KnownZero;
48513   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48514   if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
48515                                      KnownZero, DCI)) {
48516     if (N->getOpcode() != ISD::DELETED_NODE)
48517       DCI.AddToWorklist(N);
48518     return SDValue(N, 0);
48519   }
48520 
48521   return SDValue();
48522 }
48523 
48524 /// Return 'true' if this vector operation is "horizontal"
48525 /// and return the operands for the horizontal operation in LHS and RHS.  A
48526 /// horizontal operation performs the binary operation on successive elements
48527 /// of its first operand, then on successive elements of its second operand,
48528 /// returning the resulting values in a vector.  For example, if
48529 ///   A = < float a0, float a1, float a2, float a3 >
48530 /// and
48531 ///   B = < float b0, float b1, float b2, float b3 >
48532 /// then the result of doing a horizontal operation on A and B is
48533 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
48534 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
48535 /// A horizontal-op B, for some already available A and B, and if so then LHS is
48536 /// set to A, RHS to B, and the routine returns 'true'.
48537 static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
48538                               SelectionDAG &DAG, const X86Subtarget &Subtarget,
48539                               bool IsCommutative,
48540                               SmallVectorImpl<int> &PostShuffleMask) {
48541   // If either operand is undef, bail out. The binop should be simplified.
48542   if (LHS.isUndef() || RHS.isUndef())
48543     return false;
48544 
48545   // Look for the following pattern:
48546   //   A = < float a0, float a1, float a2, float a3 >
48547   //   B = < float b0, float b1, float b2, float b3 >
48548   // and
48549   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
48550   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
48551   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
48552   // which is A horizontal-op B.
48553 
48554   MVT VT = LHS.getSimpleValueType();
48555   assert((VT.is128BitVector() || VT.is256BitVector()) &&
48556          "Unsupported vector type for horizontal add/sub");
48557   unsigned NumElts = VT.getVectorNumElements();
48558 
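        // Match Op as a target shuffle (looking through bitcasts and, for 256-bit
        // sources, a zero-index EXTRACT_SUBVECTOR), returning its one or two inputs
        // in N0/N1 and a mask scaled to NumElts elements in ShuffleMask.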
48559   auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
48560                         SmallVectorImpl<int> &ShuffleMask) {
48561     bool UseSubVector = false;
48562     if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
48563         Op.getOperand(0).getValueType().is256BitVector() &&
48564         llvm::isNullConstant(Op.getOperand(1))) {
48565       Op = Op.getOperand(0);
48566       UseSubVector = true;
48567     }
48568     SmallVector<SDValue, 2> SrcOps;
48569     SmallVector<int, 16> SrcMask, ScaledMask;
48570     SDValue BC = peekThroughBitcasts(Op);
48571     if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
48572         !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
48573           return Op.getValueSizeInBits() == BC.getValueSizeInBits();
48574         })) {
48575       resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
48576       if (!UseSubVector && SrcOps.size() <= 2 &&
48577           scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
48578         N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
48579         N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
48580         ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
48581       }
48582       if (UseSubVector && SrcOps.size() == 1 &&
48583           scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
48584         std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
48585         ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
48586         ShuffleMask.assign(Mask.begin(), Mask.end());
48587       }
48588     }
48589   };
48590 
48591   // View LHS in the form
48592   //   LHS = VECTOR_SHUFFLE A, B, LMask
48593   // If LHS is not a shuffle, then pretend it is the identity shuffle:
48594   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
48595   // NOTE: A default initialized SDValue represents an UNDEF of type VT.
48596   SDValue A, B;
48597   SmallVector<int, 16> LMask;
48598   GetShuffle(LHS, A, B, LMask);
48599 
48600   // Likewise, view RHS in the form
48601   //   RHS = VECTOR_SHUFFLE C, D, RMask
48602   SDValue C, D;
48603   SmallVector<int, 16> RMask;
48604   GetShuffle(RHS, C, D, RMask);
48605 
48606   // At least one of the operands should be a vector shuffle.
48607   unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
48608   if (NumShuffles == 0)
48609     return false;
48610 
48611   if (LMask.empty()) {
48612     A = LHS;
48613     for (unsigned i = 0; i != NumElts; ++i)
48614       LMask.push_back(i);
48615   }
48616 
48617   if (RMask.empty()) {
48618     C = RHS;
48619     for (unsigned i = 0; i != NumElts; ++i)
48620       RMask.push_back(i);
48621   }
48622 
48623   // If we have a unary mask, ensure the other op is set to null.
48624   if (isUndefOrInRange(LMask, 0, NumElts))
48625     B = SDValue();
48626   else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
48627     A = SDValue();
48628 
48629   if (isUndefOrInRange(RMask, 0, NumElts))
48630     D = SDValue();
48631   else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
48632     C = SDValue();
48633 
48634   // If A and B occur in reverse order in RHS, then canonicalize by commuting
48635   // RHS operands and shuffle mask.
48636   if (A != C) {
48637     std::swap(C, D);
48638     ShuffleVectorSDNode::commuteMask(RMask);
48639   }
48640   // Check that the shuffles are both shuffling the same vectors.
48641   if (!(A == C && B == D))
48642     return false;
48643 
48644   PostShuffleMask.clear();
48645   PostShuffleMask.append(NumElts, SM_SentinelUndef);
48646 
48647   // LHS and RHS are now:
48648   //   LHS = shuffle A, B, LMask
48649   //   RHS = shuffle A, B, RMask
48650   // Check that the masks correspond to performing a horizontal operation.
48651   // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
48652   // so we just repeat the inner loop if this is a 256-bit op.
48653   unsigned Num128BitChunks = VT.getSizeInBits() / 128;
48654   unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
48655   unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
48656   assert((NumEltsPer128BitChunk % 2 == 0) &&
48657          "Vector type should have an even number of elements in each lane");
48658   for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
48659     for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
48660       // Ignore undefined components.
48661       int LIdx = LMask[i + j], RIdx = RMask[i + j];
48662       if (LIdx < 0 || RIdx < 0 ||
48663           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
48664           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
48665         continue;
48666 
48667       // Check that successive odd/even elements are being operated on. If not,
48668       // this is not a horizontal operation.
48669       if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
48670           !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
48671         return false;
48672 
48673       // Compute the post-shuffle mask index based on where the element
48674       // is stored in the HOP result, and where it needs to be moved to.
48675       int Base = LIdx & ~1u;
48676       int Index = ((Base % NumEltsPer128BitChunk) / 2) +
48677                   ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
48678 
48679       // The low half of the 128-bit result must choose from A.
48680       // The high half of the 128-bit result must choose from B,
48681       // unless B is undef. In that case, we are always choosing from A.
48682       if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
48683         Index += NumEltsPer64BitChunk;
48684       PostShuffleMask[i + j] = Index;
48685     }
48686   }
48687 
48688   SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
48689   SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
48690 
48691   bool IsIdentityPostShuffle =
48692       isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
48693   if (IsIdentityPostShuffle)
48694     PostShuffleMask.clear();
48695 
48696   // Avoid 128-bit multi-lane shuffles if pre-AVX2 and FP (integer will split).
48697   if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
48698       isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
48699     return false;
48700 
48701   // If the source nodes are already used in HorizOps then always accept this.
48702   // Shuffle folding should merge these back together.
48703   bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
48704     return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
48705   });
48706   bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
48707     return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
48708   });
48709   bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
48710 
48711   // Assume a SingleSource HOP if we only shuffle one input and don't need to
48712   // shuffle the result.
48713   if (!ForceHorizOp &&
48714       !shouldUseHorizontalOp(NewLHS == NewRHS &&
48715                                  (NumShuffles < 2 || !IsIdentityPostShuffle),
48716                              DAG, Subtarget))
48717     return false;
48718 
48719   LHS = DAG.getBitcast(VT, NewLHS);
48720   RHS = DAG.getBitcast(VT, NewRHS);
48721   return true;
48722 }
48723 
48724 // Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
48725 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
48726                                          const X86Subtarget &Subtarget) {
48727   EVT VT = N->getValueType(0);
48728   unsigned Opcode = N->getOpcode();
48729   bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
48730   SmallVector<int, 8> PostShuffleMask;
48731 
48732   switch (Opcode) {
48733   case ISD::FADD:
48734   case ISD::FSUB:
48735     if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
48736         (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
48737       SDValue LHS = N->getOperand(0);
48738       SDValue RHS = N->getOperand(1);
48739       auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
48740       if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
48741                             PostShuffleMask)) {
48742         SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
48743         if (!PostShuffleMask.empty())
48744           HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
48745                                             DAG.getUNDEF(VT), PostShuffleMask);
48746         return HorizBinOp;
48747       }
48748     }
48749     break;
48750   case ISD::ADD:
48751   case ISD::SUB:
48752     if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
48753                                  VT == MVT::v16i16 || VT == MVT::v8i32)) {
48754       SDValue LHS = N->getOperand(0);
48755       SDValue RHS = N->getOperand(1);
48756       auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
48757       if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
48758                             PostShuffleMask)) {
48759         auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
48760                                         ArrayRef<SDValue> Ops) {
48761           return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
48762         };
48763         SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
48764                                               {LHS, RHS}, HOpBuilder);
48765         if (!PostShuffleMask.empty())
48766           HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
48767                                             DAG.getUNDEF(VT), PostShuffleMask);
48768         return HorizBinOp;
48769       }
48770     }
48771     break;
48772   }
48773 
48774   return SDValue();
48775 }
48776 
48777 //  Try to combine the following nodes
48778 //  t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
48779 //    <i32 -2147483648[float -0.000000e+00]> 0
48780 //  t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
48781 //    <(load 4 from constant-pool)> t0, t29
48782 //  [t30: v16i32 = bitcast t27]
48783 //  t6: v16i32 = xor t7, t27[t30]
48784 //  t11: v16f32 = bitcast t6
48785 //  t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
48786 //  into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
48787 //  t22: v16f32 = bitcast t7
48788 //  t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
48789 //  t24: v32f16 = bitcast t23
48790 static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
48791                                   const X86Subtarget &Subtarget) {
48792   EVT VT = N->getValueType(0);
48793   SDValue LHS = N->getOperand(0);
48794   SDValue RHS = N->getOperand(1);
48795   int CombineOpcode =
48796       N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
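        // Recognize the XOR masks used for conjugation: the sign-bit pattern
        // 0x80000000 in each 32-bit chunk (0x8000000080000000 for 64-bit
        // constants), or a floating-point -0.0.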
48797   auto isConjugationConstant = [](const Constant *c) {
48798     if (const auto *CI = dyn_cast<ConstantInt>(c)) {
48799       APInt ConjugationInt32 = APInt(32, 0x80000000, true);
48800       APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
48801       switch (CI->getBitWidth()) {
48802       case 16:
48803         return false;
48804       case 32:
48805         return CI->getValue() == ConjugationInt32;
48806       case 64:
48807         return CI->getValue() == ConjugationInt64;
48808       default:
48809         llvm_unreachable("Unexpected bit width");
48810       }
48811     }
48812     if (const auto *CF = dyn_cast<ConstantFP>(c))
48813       return CF->isNegativeZeroValue();
48814     return false;
48815   };
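        // If one multiplicand is bitcast(xor(X, broadcast(conjugation mask))), the
        // XOR conjugates X; drop it and switch to the opposite (conjugating or
        // non-conjugating) multiply opcode with X used directly.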
48816   auto combineConjugation = [&](SDValue &r) {
48817     if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
48818       SDValue XOR = LHS.getOperand(0);
48819       if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
48820         SDValue XORRHS = XOR.getOperand(1);
48821         if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
48822           XORRHS = XORRHS.getOperand(0);
48823         if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
48824             XORRHS.getOperand(1).getNumOperands()) {
48825           ConstantPoolSDNode *CP =
48826               dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
48827           if (CP && isConjugationConstant(CP->getConstVal())) {
48828             SelectionDAG::FlagInserter FlagsInserter(DAG, N);
48829             SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
48830             SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
48831             r = DAG.getBitcast(VT, FCMulC);
48832             return true;
48833           }
48834         }
48835       }
48836     }
48837     return false;
48838   };
48839   SDValue Res;
48840   if (combineConjugation(Res))
48841     return Res;
48842   std::swap(LHS, RHS);
48843   if (combineConjugation(Res))
48844     return Res;
48845   return Res;
48846 }
48847 
48848 //  Try to combine the following nodes:
48849 //  FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
48850 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
48851                                 const X86Subtarget &Subtarget) {
48852   auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
48853     return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
48854            Flags.hasAllowContract();
48855   };
48856 
48857   auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
48858     return DAG.getTarget().Options.NoSignedZerosFPMath ||
48859            Flags.hasNoSignedZeros();
48860   };
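        // Match an accumulator that is a broadcast load of the constant 0x80008000,
        // i.e. a splat of f16 -0.0 values; such an addend leaves the product
        // unchanged, so the complex FMA can be treated as a plain complex multiply.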
48861   auto IsVectorAllNegativeZero = [](const SDNode *N) {
48862     if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
48863       return false;
48864     assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
48865            "Unexpected vector type!");
48866     if (ConstantPoolSDNode *CP =
48867             dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
48868       APInt AI = APInt(32, 0x80008000, true);
48869       if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
48870         return CI->getValue() == AI;
48871       if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
48872         return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
48873     }
48874     return false;
48875   };
48876 
48877   if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
48878       !AllowContract(N->getFlags()))
48879     return SDValue();
48880 
48881   EVT VT = N->getValueType(0);
48882   if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
48883     return SDValue();
48884 
48885   SDValue LHS = N->getOperand(0);
48886   SDValue RHS = N->getOperand(1);
48887   bool IsConj;
48888   SDValue FAddOp1, MulOp0, MulOp1;
48889   auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
48890                        &IsVectorAllNegativeZero,
48891                        &HasNoSignedZero](SDValue N) -> bool {
48892     if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
48893       return false;
48894     SDValue Op0 = N.getOperand(0);
48895     unsigned Opcode = Op0.getOpcode();
48896     if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
48897       if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
48898         MulOp0 = Op0.getOperand(0);
48899         MulOp1 = Op0.getOperand(1);
48900         IsConj = Opcode == X86ISD::VFCMULC;
48901         return true;
48902       }
48903       if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
48904           ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
48905             HasNoSignedZero(Op0->getFlags())) ||
48906            IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
48907         MulOp0 = Op0.getOperand(0);
48908         MulOp1 = Op0.getOperand(1);
48909         IsConj = Opcode == X86ISD::VFCMADDC;
48910         return true;
48911       }
48912     }
48913     return false;
48914   };
48915 
48916   if (GetCFmulFrom(LHS))
48917     FAddOp1 = RHS;
48918   else if (GetCFmulFrom(RHS))
48919     FAddOp1 = LHS;
48920   else
48921     return SDValue();
48922 
48923   MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
48924   FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
48925   unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
48926   // FIXME: How do we handle the case where the fast-math flags of the FADD
48927   // differ from the CFMUL's?
48928   SDValue CFmul =
48929       DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
48930   return DAG.getBitcast(VT, CFmul);
48931 }
48932 
48933 /// Do target-specific dag combines on floating-point adds/subs.
48934 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
48935                                const X86Subtarget &Subtarget) {
48936   if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
48937     return HOp;
48938 
48939   if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
48940     return COp;
48941 
48942   return SDValue();
48943 }
48944 
48945 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
48946 /// the codegen.
48947 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
48948 /// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
48949 ///       anything that is guaranteed to be transformed by DAGCombiner.
48950 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
48951                                           const X86Subtarget &Subtarget,
48952                                           const SDLoc &DL) {
48953   assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
48954   SDValue Src = N->getOperand(0);
48955   unsigned SrcOpcode = Src.getOpcode();
48956   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48957 
48958   EVT VT = N->getValueType(0);
48959   EVT SrcVT = Src.getValueType();
48960 
48961   auto IsFreeTruncation = [VT](SDValue Op) {
48962     unsigned TruncSizeInBits = VT.getScalarSizeInBits();
48963 
48964     // See if this has been extended from a smaller/equal size to
48965     // the truncation size, allowing a truncation to combine with the extend.
48966     unsigned Opcode = Op.getOpcode();
48967     if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
48968          Opcode == ISD::ZERO_EXTEND) &&
48969         Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
48970       return true;
48971 
48972     // See if this is a single use constant which can be constant folded.
48973     // NOTE: We don't peek through bitcasts here because there is currently
48974     // no support for constant folding truncate+bitcast+vector_of_constants, so
48975     // we'd just end up with a truncate on both operands, which would
48976     // get turned back into (truncate (binop)), causing an infinite loop.
48977     return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
48978   };
48979 
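        // Truncate both operands to the narrow type and recreate the binop there.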
48980   auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
48981     SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
48982     SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
48983     return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
48984   };
48985 
48986   // Don't combine if the operation has other uses.
48987   if (!Src.hasOneUse())
48988     return SDValue();
48989 
48990   // Only support vector truncation for now.
48991   // TODO: i64 scalar math would benefit as well.
48992   if (!VT.isVector())
48993     return SDValue();
48994 
  // In most cases it's only worth pre-truncating if we're only facing the cost
  // of one truncation.
  // i.e. if one of the inputs will constant fold or the input is repeated.
48998   switch (SrcOpcode) {
48999   case ISD::MUL:
    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) -
    // it's better to truncate if we have the chance.
49002     if (SrcVT.getScalarType() == MVT::i64 &&
49003         TLI.isOperationLegal(SrcOpcode, VT) &&
49004         !TLI.isOperationLegal(SrcOpcode, SrcVT))
49005       return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
49006     LLVM_FALLTHROUGH;
49007   case ISD::AND:
49008   case ISD::XOR:
49009   case ISD::OR:
49010   case ISD::ADD:
49011   case ISD::SUB: {
49012     SDValue Op0 = Src.getOperand(0);
49013     SDValue Op1 = Src.getOperand(1);
49014     if (TLI.isOperationLegal(SrcOpcode, VT) &&
49015         (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
49016       return TruncateArithmetic(Op0, Op1);
49017     break;
49018   }
49019   }
49020 
49021   return SDValue();
49022 }
49023 
49024 /// Truncate using ISD::AND mask and X86ISD::PACKUS.
49025 /// e.g. trunc <8 x i32> X to <8 x i16> -->
49026 /// MaskX = X & 0xffff (clear high bits to prevent saturation)
49027 /// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
49028 static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
49029                                                  const X86Subtarget &Subtarget,
49030                                                  SelectionDAG &DAG) {
49031   SDValue In = N->getOperand(0);
49032   EVT InVT = In.getValueType();
49033   EVT OutVT = N->getValueType(0);
49034 
49035   APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
49036                                     OutVT.getScalarSizeInBits());
49037   In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
49038   return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
49039 }
49040 
49041 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
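/// A sketch of the sequence this produces, e.g. for <8 x i32> X -> <8 x i16>:
///   SignX = sign_extend_inreg X, i16 (replicate the i16 sign bit)
///   packss (extract_subv SignX, 0), (extract_subv SignX, 1)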
49042 static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
49043                                                  const X86Subtarget &Subtarget,
49044                                                  SelectionDAG &DAG) {
49045   SDValue In = N->getOperand(0);
49046   EVT InVT = In.getValueType();
49047   EVT OutVT = N->getValueType(0);
49048   In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
49049                    DAG.getValueType(OutVT));
49050   return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
49051 }
49052 
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element extracted from a vector and then truncated, and it is difficult to
/// do this optimization based on those nodes.
49058 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
49059                                        const X86Subtarget &Subtarget) {
49060   EVT OutVT = N->getValueType(0);
49061   if (!OutVT.isVector())
49062     return SDValue();
49063 
49064   SDValue In = N->getOperand(0);
49065   if (!In.getValueType().isSimple())
49066     return SDValue();
49067 
49068   EVT InVT = In.getValueType();
49069   unsigned NumElems = OutVT.getVectorNumElements();
49070 
49071   // AVX512 provides fast truncate ops.
49072   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
49073     return SDValue();
49074 
49075   EVT OutSVT = OutVT.getVectorElementType();
49076   EVT InSVT = InVT.getVectorElementType();
49077   if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
49078         (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
49079         NumElems >= 8))
49080     return SDValue();
49081 
  // SSSE3's pshufb results in fewer instructions in the cases below.
49083   if (Subtarget.hasSSSE3() && NumElems == 8) {
49084     if (InSVT == MVT::i16)
49085       return SDValue();
49086     if (InSVT == MVT::i32 &&
49087         (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
49088       return SDValue();
49089   }
49090 
49091   SDLoc DL(N);
49092   // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
49093   // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
49094   // truncate 2 x v4i32 to v8i16.
49095   if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
49096     return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
49097   if (InSVT == MVT::i32)
49098     return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
49099 
49100   return SDValue();
49101 }
49102 
/// This function transforms vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values, i.e. vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32,
/// into X86ISD::PACKSS/PACKUS operations.
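/// e.g. (illustrative) a v8i32 comparison result, whose elements are all sign
/// bits, can be truncated to v8i16 with a single X86ISD::PACKSS node instead
/// of a generic truncation.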
49106 static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
49107                                                SelectionDAG &DAG,
49108                                                const X86Subtarget &Subtarget) {
49109   // Requires SSE2.
49110   if (!Subtarget.hasSSE2())
49111     return SDValue();
49112 
49113   if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
49114     return SDValue();
49115 
49116   SDValue In = N->getOperand(0);
49117   if (!In.getValueType().isSimple())
49118     return SDValue();
49119 
49120   MVT VT = N->getValueType(0).getSimpleVT();
49121   MVT SVT = VT.getScalarType();
49122 
49123   MVT InVT = In.getValueType().getSimpleVT();
49124   MVT InSVT = InVT.getScalarType();
49125 
49126   // Check we have a truncation suited for PACKSS/PACKUS.
49127   if (!isPowerOf2_32(VT.getVectorNumElements()))
49128     return SDValue();
49129   if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
49130     return SDValue();
49131   if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
49132     return SDValue();
49133 
49134   // Truncation to sub-128bit vXi32 can be better handled with shuffles.
49135   if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
49136     return SDValue();
49137 
49138   // AVX512 has fast truncate, but if the input is already going to be split,
49139   // there's no harm in trying pack.
49140   if (Subtarget.hasAVX512() &&
49141       !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
49142         InVT.is512BitVector())) {
49143     // PACK should still be worth it for 128-bit vectors if the sources were
49144     // originally concatenated from subvectors.
49145     SmallVector<SDValue> ConcatOps;
49146     if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
49147       return SDValue();
49148   }
49149 
49150   unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
49151   unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
49152 
49153   // Use PACKUS if the input has zero-bits that extend all the way to the
49154   // packed/truncated value. e.g. masks, zext_in_reg, etc.
49155   KnownBits Known = DAG.computeKnownBits(In);
49156   unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
49157   if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
49158     return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
49159 
49160   // Use PACKSS if the input has sign-bits that extend all the way to the
49161   // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
49162   unsigned NumSignBits = DAG.ComputeNumSignBits(In);
49163 
49164   // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
49165   // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
49166   // on and combines/simplifications can't then use it.
49167   if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
49168     return SDValue();
49169 
49170   unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
49171   if (NumSignBits > MinSignBits)
49172     return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
49173 
  // If we have a srl that only generates sign bits that we will discard in
  // the truncation, then we can use PACKSS by converting the srl to a sra.
  // SimplifyDemandedBits often relaxes sra to srl, so we need to reverse it.
49177   if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
49178     if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
49179             In, APInt::getAllOnes(VT.getVectorNumElements()))) {
49180       if (*ShAmt == MinSignBits) {
49181         SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
49182         return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
49183                                       Subtarget);
49184       }
49185     }
49186 
49187   return SDValue();
49188 }
49189 
49190 // Try to form a MULHU or MULHS node by looking for
49191 // (trunc (srl (mul ext, ext), 16))
49192 // TODO: This is X86 specific because we want to be able to handle wide types
49193 // before type legalization. But we can only do it if the vector will be
49194 // legalized via widening/splitting. Type legalization can't handle promotion
49195 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49196 // combiner.
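// For illustration, assuming inputs that are extended from vXi16:
//   (vXi16 trunc (srl (mul (zext A), (zext B)), 16)) --> (vXi16 mulhu A, B)
//   (vXi16 trunc (srl (mul (sext A), (sext B)), 16)) --> (vXi16 mulhs A, B)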
49197 static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
49198                             SelectionDAG &DAG, const X86Subtarget &Subtarget) {
49199   // First instruction should be a right shift of a multiply.
49200   if (Src.getOpcode() != ISD::SRL ||
49201       Src.getOperand(0).getOpcode() != ISD::MUL)
49202     return SDValue();
49203 
49204   if (!Subtarget.hasSSE2())
49205     return SDValue();
49206 
  // Only handle vXi16 types that are at least 128 bits wide unless they will
  // be widened.
49209   if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
49210     return SDValue();
49211 
49212   // Input type should be at least vXi32.
49213   EVT InVT = Src.getValueType();
49214   if (InVT.getVectorElementType().getSizeInBits() < 32)
49215     return SDValue();
49216 
49217   // Need a shift by 16.
49218   APInt ShiftAmt;
49219   if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
49220       ShiftAmt != 16)
49221     return SDValue();
49222 
49223   SDValue LHS = Src.getOperand(0).getOperand(0);
49224   SDValue RHS = Src.getOperand(0).getOperand(1);
49225 
49226   // Count leading sign/zero bits on both inputs - if there are enough then
49227   // truncation back to vXi16 will be cheap - either as a pack/shuffle
49228   // sequence or using AVX512 truncations. If the inputs are sext/zext then the
49229   // truncations may actually be free by peeking through to the ext source.
49230   auto IsSext = [&DAG](SDValue V) {
49231     return DAG.ComputeMaxSignificantBits(V) <= 16;
49232   };
49233   auto IsZext = [&DAG](SDValue V) {
49234     return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
49235   };
49236 
49237   bool IsSigned = IsSext(LHS) && IsSext(RHS);
49238   bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
49239   if (!IsSigned && !IsUnsigned)
49240     return SDValue();
49241 
49242   // Check if both inputs are extensions, which will be removed by truncation.
49243   bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
49244                          LHS.getOpcode() == ISD::ZERO_EXTEND) &&
49245                         (RHS.getOpcode() == ISD::SIGN_EXTEND ||
49246                          RHS.getOpcode() == ISD::ZERO_EXTEND) &&
49247                         LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
49248                         RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
49249 
49250   // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
49251   // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
49252   // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
49253   // will have to split anyway.
49254   unsigned InSizeInBits = InVT.getSizeInBits();
49255   if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
49256       !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
49257       (InSizeInBits % 16) == 0) {
49258     EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49259                                 InVT.getSizeInBits() / 16);
49260     SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
49261                               DAG.getBitcast(BCVT, RHS));
49262     return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
49263   }
49264 
49265   // Truncate back to source type.
49266   LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
49267   RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
49268 
49269   unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
49270   return DAG.getNode(Opc, DL, VT, LHS, RHS);
49271 }
49272 
49273 // Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
49274 // from one vector with signed bytes from another vector, adds together
49275 // adjacent pairs of 16-bit products, and saturates the result before
49276 // truncating to 16-bits.
49277 //
49278 // Which looks something like this:
49279 // (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
49280 //                 (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
49281 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
49282                                const X86Subtarget &Subtarget,
49283                                const SDLoc &DL) {
49284   if (!VT.isVector() || !Subtarget.hasSSSE3())
49285     return SDValue();
49286 
49287   unsigned NumElems = VT.getVectorNumElements();
49288   EVT ScalarVT = VT.getVectorElementType();
49289   if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
49290     return SDValue();
49291 
49292   SDValue SSatVal = detectSSatPattern(In, VT);
49293   if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
49294     return SDValue();
49295 
49296   // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
49297   // of multiplies from even/odd elements.
49298   SDValue N0 = SSatVal.getOperand(0);
49299   SDValue N1 = SSatVal.getOperand(1);
49300 
49301   if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
49302     return SDValue();
49303 
49304   SDValue N00 = N0.getOperand(0);
49305   SDValue N01 = N0.getOperand(1);
49306   SDValue N10 = N1.getOperand(0);
49307   SDValue N11 = N1.getOperand(1);
49308 
49309   // TODO: Handle constant vectors and use knownbits/computenumsignbits?
49310   // Canonicalize zero_extend to LHS.
49311   if (N01.getOpcode() == ISD::ZERO_EXTEND)
49312     std::swap(N00, N01);
49313   if (N11.getOpcode() == ISD::ZERO_EXTEND)
49314     std::swap(N10, N11);
49315 
49316   // Ensure we have a zero_extend and a sign_extend.
49317   if (N00.getOpcode() != ISD::ZERO_EXTEND ||
49318       N01.getOpcode() != ISD::SIGN_EXTEND ||
49319       N10.getOpcode() != ISD::ZERO_EXTEND ||
49320       N11.getOpcode() != ISD::SIGN_EXTEND)
49321     return SDValue();
49322 
49323   // Peek through the extends.
49324   N00 = N00.getOperand(0);
49325   N01 = N01.getOperand(0);
49326   N10 = N10.getOperand(0);
49327   N11 = N11.getOperand(0);
49328 
49329   // Ensure the extend is from vXi8.
49330   if (N00.getValueType().getVectorElementType() != MVT::i8 ||
49331       N01.getValueType().getVectorElementType() != MVT::i8 ||
49332       N10.getValueType().getVectorElementType() != MVT::i8 ||
49333       N11.getValueType().getVectorElementType() != MVT::i8)
49334     return SDValue();
49335 
49336   // All inputs should be build_vectors.
49337   if (N00.getOpcode() != ISD::BUILD_VECTOR ||
49338       N01.getOpcode() != ISD::BUILD_VECTOR ||
49339       N10.getOpcode() != ISD::BUILD_VECTOR ||
49340       N11.getOpcode() != ISD::BUILD_VECTOR)
49341     return SDValue();
49342 
49343   // N00/N10 are zero extended. N01/N11 are sign extended.
49344 
  // For each result element i, the even element (2*i) of one vector must be
  // multiplied by the even element of the other vector, and the odd element
  // (2*i+1) of the first vector must be multiplied by the odd element of the
  // other vector. That is, for each element i the following is computed:
  //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
49351   SDValue ZExtIn, SExtIn;
49352   for (unsigned i = 0; i != NumElems; ++i) {
49353     SDValue N00Elt = N00.getOperand(i);
49354     SDValue N01Elt = N01.getOperand(i);
49355     SDValue N10Elt = N10.getOperand(i);
49356     SDValue N11Elt = N11.getOperand(i);
49357     // TODO: Be more tolerant to undefs.
49358     if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49359         N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49360         N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49361         N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
49362       return SDValue();
49363     auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
49364     auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
49365     auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
49366     auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
49367     if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
49368       return SDValue();
49369     unsigned IdxN00 = ConstN00Elt->getZExtValue();
49370     unsigned IdxN01 = ConstN01Elt->getZExtValue();
49371     unsigned IdxN10 = ConstN10Elt->getZExtValue();
49372     unsigned IdxN11 = ConstN11Elt->getZExtValue();
49373     // Add is commutative so indices can be reordered.
49374     if (IdxN00 > IdxN10) {
49375       std::swap(IdxN00, IdxN10);
49376       std::swap(IdxN01, IdxN11);
49377     }
    // N0 indices must be the even element. N1 indices must be the next odd
    // element.
49379     if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
49380         IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
49381       return SDValue();
49382     SDValue N00In = N00Elt.getOperand(0);
49383     SDValue N01In = N01Elt.getOperand(0);
49384     SDValue N10In = N10Elt.getOperand(0);
49385     SDValue N11In = N11Elt.getOperand(0);
    // The first time we find an input, capture it.
49387     if (!ZExtIn) {
49388       ZExtIn = N00In;
49389       SExtIn = N01In;
49390     }
49391     if (ZExtIn != N00In || SExtIn != N01In ||
49392         ZExtIn != N10In || SExtIn != N11In)
49393       return SDValue();
49394   }
49395 
49396   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49397                          ArrayRef<SDValue> Ops) {
49398     // Shrink by adding truncate nodes and let DAGCombine fold with the
49399     // sources.
49400     EVT InVT = Ops[0].getValueType();
49401     assert(InVT.getScalarType() == MVT::i8 &&
49402            "Unexpected scalar element type");
49403     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
49404     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49405                                  InVT.getVectorNumElements() / 2);
49406     return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
49407   };
49408   return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
49409                           PMADDBuilder);
49410 }
49411 
49412 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
49413                                const X86Subtarget &Subtarget) {
49414   EVT VT = N->getValueType(0);
49415   SDValue Src = N->getOperand(0);
49416   SDLoc DL(N);
49417 
49418   // Attempt to pre-truncate inputs to arithmetic ops instead.
49419   if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
49420     return V;
49421 
49422   // Try to detect AVG pattern first.
49423   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
49424     return Avg;
49425 
  // Try to detect PMADDUBSW.
49427   if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
49428     return PMAdd;
49429 
49430   // Try to combine truncation with signed/unsigned saturation.
49431   if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
49432     return Val;
49433 
49434   // Try to combine PMULHUW/PMULHW for vXi16.
49435   if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
49436     return V;
49437 
  // Detect a truncation of a bitcast from x86mmx to i32, where the bitcast
  // source is a direct MMX result.
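  // e.g. (illustrative):
  //   (i32 (trunc (i64 (bitcast (x86mmx X))))) --> (X86ISD::MMX_MOVD2W X)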
49440   if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
49441     SDValue BCSrc = Src.getOperand(0);
49442     if (BCSrc.getValueType() == MVT::x86mmx)
49443       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
49444   }
49445 
49446   // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
49447   if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
49448     return V;
49449 
49450   return combineVectorTruncation(N, DAG, Subtarget);
49451 }
49452 
49453 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
49454                              TargetLowering::DAGCombinerInfo &DCI) {
49455   EVT VT = N->getValueType(0);
49456   SDValue In = N->getOperand(0);
49457   SDLoc DL(N);
49458 
49459   if (auto SSatVal = detectSSatPattern(In, VT))
49460     return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
49461   if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
49462     return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
49463 
49464   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49465   APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
49466   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
49467     return SDValue(N, 0);
49468 
49469   return SDValue();
49470 }
49471 
49472 /// Returns the negated value if the node \p N flips sign of FP value.
49473 ///
/// An FP-negation node may have different forms: FNEG(x), FXOR(x, 0x80000000)
/// or FSUB(0, x).
/// AVX512F does not have FXOR, so FNEG is lowered as
/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go through all bitcasts.
49479 /// This also recognizes splat of a negated value and returns the splat of that
49480 /// value.
49481 static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
49482   if (N->getOpcode() == ISD::FNEG)
49483     return N->getOperand(0);
49484 
49485   // Don't recurse exponentially.
49486   if (Depth > SelectionDAG::MaxRecursionDepth)
49487     return SDValue();
49488 
49489   unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
49490 
49491   SDValue Op = peekThroughBitcasts(SDValue(N, 0));
49492   EVT VT = Op->getValueType(0);
49493 
49494   // Make sure the element size doesn't change.
49495   if (VT.getScalarSizeInBits() != ScalarSize)
49496     return SDValue();
49497 
49498   unsigned Opc = Op.getOpcode();
49499   switch (Opc) {
49500   case ISD::VECTOR_SHUFFLE: {
    // For a VECTOR_SHUFFLE(VEC1, VEC2), if VEC2 is undef, then the negate
    // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
49503     if (!Op.getOperand(1).isUndef())
49504       return SDValue();
49505     if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
49506       if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
49507         return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
49508                                     cast<ShuffleVectorSDNode>(Op)->getMask());
49509     break;
49510   }
49511   case ISD::INSERT_VECTOR_ELT: {
49512     // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
49513     // -V, INDEX).
49514     SDValue InsVector = Op.getOperand(0);
49515     SDValue InsVal = Op.getOperand(1);
49516     if (!InsVector.isUndef())
49517       return SDValue();
49518     if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
49519       if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
49520         return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
49521                            NegInsVal, Op.getOperand(2));
49522     break;
49523   }
49524   case ISD::FSUB:
49525   case ISD::XOR:
49526   case X86ISD::FXOR: {
49527     SDValue Op1 = Op.getOperand(1);
49528     SDValue Op0 = Op.getOperand(0);
49529 
    // For XOR and FXOR, we want to check if constant bits of Op1 are sign bit
    // masks. For FSUB, we have to check if constant bits of Op0 are sign bit
    // masks and hence we swap the operands.
49534     if (Opc == ISD::FSUB)
49535       std::swap(Op0, Op1);
49536 
49537     APInt UndefElts;
49538     SmallVector<APInt, 16> EltBits;
49539     // Extract constant bits and see if they are all
49540     // sign bit masks. Ignore the undef elements.
49541     if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
49542                                       /* AllowWholeUndefs */ true,
49543                                       /* AllowPartialUndefs */ false)) {
49544       for (unsigned I = 0, E = EltBits.size(); I < E; I++)
49545         if (!UndefElts[I] && !EltBits[I].isSignMask())
49546           return SDValue();
49547 
49548       return peekThroughBitcasts(Op0);
49549     }
49550   }
49551   }
49552 
49553   return SDValue();
49554 }
49555 
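// Map an FMA-family opcode to the opcode obtained after negating the
// multiplicand (NegMul), the accumulator (NegAcc) and/or the result (NegRes).
// e.g. (illustrative) with NegMul, ISD::FMA becomes X86ISD::FNMADD; with
// NegAcc, ISD::FMA becomes X86ISD::FMSUB.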
49556 static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
49557                                 bool NegRes) {
49558   if (NegMul) {
49559     switch (Opcode) {
49560     default: llvm_unreachable("Unexpected opcode");
49561     case ISD::FMA:              Opcode = X86ISD::FNMADD;        break;
49562     case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FNMADD; break;
49563     case X86ISD::FMADD_RND:     Opcode = X86ISD::FNMADD_RND;    break;
49564     case X86ISD::FMSUB:         Opcode = X86ISD::FNMSUB;        break;
49565     case X86ISD::STRICT_FMSUB:  Opcode = X86ISD::STRICT_FNMSUB; break;
49566     case X86ISD::FMSUB_RND:     Opcode = X86ISD::FNMSUB_RND;    break;
49567     case X86ISD::FNMADD:        Opcode = ISD::FMA;              break;
49568     case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA;       break;
49569     case X86ISD::FNMADD_RND:    Opcode = X86ISD::FMADD_RND;     break;
49570     case X86ISD::FNMSUB:        Opcode = X86ISD::FMSUB;         break;
49571     case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB;  break;
49572     case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FMSUB_RND;     break;
49573     }
49574   }
49575 
49576   if (NegAcc) {
49577     switch (Opcode) {
49578     default: llvm_unreachable("Unexpected opcode");
49579     case ISD::FMA:              Opcode = X86ISD::FMSUB;         break;
49580     case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FMSUB;  break;
49581     case X86ISD::FMADD_RND:     Opcode = X86ISD::FMSUB_RND;     break;
49582     case X86ISD::FMSUB:         Opcode = ISD::FMA;              break;
49583     case X86ISD::STRICT_FMSUB:  Opcode = ISD::STRICT_FMA;       break;
49584     case X86ISD::FMSUB_RND:     Opcode = X86ISD::FMADD_RND;     break;
49585     case X86ISD::FNMADD:        Opcode = X86ISD::FNMSUB;        break;
49586     case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
49587     case X86ISD::FNMADD_RND:    Opcode = X86ISD::FNMSUB_RND;    break;
49588     case X86ISD::FNMSUB:        Opcode = X86ISD::FNMADD;        break;
49589     case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
49590     case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FNMADD_RND;    break;
49591     case X86ISD::FMADDSUB:      Opcode = X86ISD::FMSUBADD;      break;
49592     case X86ISD::FMADDSUB_RND:  Opcode = X86ISD::FMSUBADD_RND;  break;
49593     case X86ISD::FMSUBADD:      Opcode = X86ISD::FMADDSUB;      break;
49594     case X86ISD::FMSUBADD_RND:  Opcode = X86ISD::FMADDSUB_RND;  break;
49595     }
49596   }
49597 
49598   if (NegRes) {
49599     switch (Opcode) {
    // For accuracy reasons, we never combine fneg and fma under strict FP.
49601     default: llvm_unreachable("Unexpected opcode");
49602     case ISD::FMA:             Opcode = X86ISD::FNMSUB;       break;
49603     case X86ISD::FMADD_RND:    Opcode = X86ISD::FNMSUB_RND;   break;
49604     case X86ISD::FMSUB:        Opcode = X86ISD::FNMADD;       break;
49605     case X86ISD::FMSUB_RND:    Opcode = X86ISD::FNMADD_RND;   break;
49606     case X86ISD::FNMADD:       Opcode = X86ISD::FMSUB;        break;
49607     case X86ISD::FNMADD_RND:   Opcode = X86ISD::FMSUB_RND;    break;
49608     case X86ISD::FNMSUB:       Opcode = ISD::FMA;             break;
49609     case X86ISD::FNMSUB_RND:   Opcode = X86ISD::FMADD_RND;    break;
49610     }
49611   }
49612 
49613   return Opcode;
49614 }
49615 
49616 /// Do target-specific dag combines on floating point negations.
49617 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
49618                            TargetLowering::DAGCombinerInfo &DCI,
49619                            const X86Subtarget &Subtarget) {
49620   EVT OrigVT = N->getValueType(0);
49621   SDValue Arg = isFNEG(DAG, N);
49622   if (!Arg)
49623     return SDValue();
49624 
49625   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49626   EVT VT = Arg.getValueType();
49627   EVT SVT = VT.getScalarType();
49628   SDLoc DL(N);
49629 
49630   // Let legalize expand this if it isn't a legal type yet.
49631   if (!TLI.isTypeLegal(VT))
49632     return SDValue();
49633 
  // If we're negating an FMUL node on a target with FMA, then we can avoid the
  // use of a constant by performing (-0 - A*B) instead.
  // FIXME: Check rounding control flags as well once they become available.
49637   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
49638       Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
49639     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
49640     SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
49641                                   Arg.getOperand(1), Zero);
49642     return DAG.getBitcast(OrigVT, NewNode);
49643   }
49644 
49645   bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
49646   bool LegalOperations = !DCI.isBeforeLegalizeOps();
49647   if (SDValue NegArg =
49648           TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
49649     return DAG.getBitcast(OrigVT, NegArg);
49650 
49651   return SDValue();
49652 }
49653 
49654 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
49655                                                 bool LegalOperations,
49656                                                 bool ForCodeSize,
49657                                                 NegatibleCost &Cost,
49658                                                 unsigned Depth) const {
49659   // fneg patterns are removable even if they have multiple uses.
49660   if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
49661     Cost = NegatibleCost::Cheaper;
49662     return DAG.getBitcast(Op.getValueType(), Arg);
49663   }
49664 
49665   EVT VT = Op.getValueType();
49666   EVT SVT = VT.getScalarType();
49667   unsigned Opc = Op.getOpcode();
49668   SDNodeFlags Flags = Op.getNode()->getFlags();
49669   switch (Opc) {
49670   case ISD::FMA:
49671   case X86ISD::FMSUB:
49672   case X86ISD::FNMADD:
49673   case X86ISD::FNMSUB:
49674   case X86ISD::FMADD_RND:
49675   case X86ISD::FMSUB_RND:
49676   case X86ISD::FNMADD_RND:
49677   case X86ISD::FNMSUB_RND: {
49678     if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
49679         !(SVT == MVT::f32 || SVT == MVT::f64) ||
49680         !isOperationLegal(ISD::FMA, VT))
49681       break;
49682 
49683     // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
49684     // if it may have signed zeros.
49685     if (!Flags.hasNoSignedZeros())
49686       break;
49687 
49688     // This is always negatible for free but we might be able to remove some
49689     // extra operand negations as well.
49690     SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
49691     for (int i = 0; i != 3; ++i)
49692       NewOps[i] = getCheaperNegatedExpression(
49693           Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
49694 
49695     bool NegA = !!NewOps[0];
49696     bool NegB = !!NewOps[1];
49697     bool NegC = !!NewOps[2];
49698     unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
49699 
49700     Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
49701                                   : NegatibleCost::Neutral;
49702 
49703     // Fill in the non-negated ops with the original values.
49704     for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
49705       if (!NewOps[i])
49706         NewOps[i] = Op.getOperand(i);
49707     return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
49708   }
49709   case X86ISD::FRCP:
49710     if (SDValue NegOp0 =
49711             getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
49712                                  ForCodeSize, Cost, Depth + 1))
49713       return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
49714     break;
49715   }
49716 
49717   return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
49718                                               ForCodeSize, Cost, Depth);
49719 }
49720 
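// Lower an X86 FP logic op to the equivalent integer logic op when an integer
// vector type is available, e.g. (illustrative):
//   (v4f32 X86ISD::FAND A, B)
//     --> (v4f32 bitcast (v4i32 and (bitcast A), (bitcast B)))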
49721 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
49722                                  const X86Subtarget &Subtarget) {
49723   MVT VT = N->getSimpleValueType(0);
49724   // If we have integer vector types available, use the integer opcodes.
49725   if (!VT.isVector() || !Subtarget.hasSSE2())
49726     return SDValue();
49727 
49728   SDLoc dl(N);
49729 
49730   unsigned IntBits = VT.getScalarSizeInBits();
49731   MVT IntSVT = MVT::getIntegerVT(IntBits);
49732   MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
49733 
49734   SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
49735   SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
49736   unsigned IntOpcode;
49737   switch (N->getOpcode()) {
49738   default: llvm_unreachable("Unexpected FP logic op");
49739   case X86ISD::FOR:   IntOpcode = ISD::OR; break;
49740   case X86ISD::FXOR:  IntOpcode = ISD::XOR; break;
49741   case X86ISD::FAND:  IntOpcode = ISD::AND; break;
49742   case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
49743   }
49744   SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
49745   return DAG.getBitcast(VT, IntOp);
49746 }
49747 
49748 
/// Fold xor(setcc(cond, val), 1) --> setcc(inverted(cond), val)
49750 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
49751   if (N->getOpcode() != ISD::XOR)
49752     return SDValue();
49753 
49754   SDValue LHS = N->getOperand(0);
49755   if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
49756     return SDValue();
49757 
49758   X86::CondCode NewCC = X86::GetOppositeBranchCondition(
49759       X86::CondCode(LHS->getConstantOperandVal(0)));
49760   SDLoc DL(N);
49761   return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
49762 }
49763 
49764 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
49765                           TargetLowering::DAGCombinerInfo &DCI,
49766                           const X86Subtarget &Subtarget) {
49767   SDValue N0 = N->getOperand(0);
49768   SDValue N1 = N->getOperand(1);
49769   EVT VT = N->getValueType(0);
49770 
49771   // If this is SSE1 only convert to FXOR to avoid scalarization.
49772   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49773     return DAG.getBitcast(MVT::v4i32,
49774                           DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
49775                                       DAG.getBitcast(MVT::v4f32, N0),
49776                                       DAG.getBitcast(MVT::v4f32, N1)));
49777   }
49778 
49779   if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
49780     return Cmp;
49781 
49782   if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49783     return R;
49784 
49785   if (SDValue R = combineBitOpWithShift(N, DAG))
49786     return R;
49787 
49788   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49789     return FPLogic;
49790 
49791   if (DCI.isBeforeLegalizeOps())
49792     return SDValue();
49793 
49794   if (SDValue SetCC = foldXor1SetCC(N, DAG))
49795     return SetCC;
49796 
49797   if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
49798     return RV;
49799 
49800   // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
49801   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49802   if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
49803       N0.getOperand(0).getValueType().isVector() &&
49804       N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
49805       TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
49806     return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
49807                                          N0.getOperand(0).getValueType()));
49808   }
49809 
49810   // Handle AVX512 mask widening.
49811   // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
49812   if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
49813       VT.getVectorElementType() == MVT::i1 &&
49814       N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
49815       TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
49816     return DAG.getNode(
49817         ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
49818         DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
49819         N0.getOperand(2));
49820   }
49821 
49822   // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
49823   // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
49824   // TODO: Under what circumstances could this be performed in DAGCombine?
49825   if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
49826       N0.getOperand(0).getOpcode() == N->getOpcode()) {
49827     SDValue TruncExtSrc = N0.getOperand(0);
49828     auto *N1C = dyn_cast<ConstantSDNode>(N1);
49829     auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
49830     if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
49831       SDLoc DL(N);
49832       SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
49833       SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
49834       return DAG.getNode(ISD::XOR, DL, VT, LHS,
49835                          DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
49836     }
49837   }
49838 
49839   return combineFneg(N, DAG, DCI, Subtarget);
49840 }
49841 
49842 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
49843                             TargetLowering::DAGCombinerInfo &DCI,
49844                             const X86Subtarget &Subtarget) {
49845   EVT VT = N->getValueType(0);
49846   unsigned NumBits = VT.getSizeInBits();
49847 
49848   // TODO - Constant Folding.
49849 
49850   // Simplify the inputs.
49851   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49852   APInt DemandedMask(APInt::getAllOnes(NumBits));
49853   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
49854     return SDValue(N, 0);
49855 
49856   return SDValue();
49857 }
49858 
49859 static bool isNullFPScalarOrVectorConst(SDValue V) {
49860   return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
49861 }
49862 
49863 /// If a value is a scalar FP zero or a vector FP zero (potentially including
49864 /// undefined elements), return a zero constant that may be used to fold away
49865 /// that value. In the case of a vector, the returned constant will not contain
49866 /// undefined elements even if the input parameter does. This makes it suitable
49867 /// to be used as a replacement operand with operations (eg, bitwise-and) where
49868 /// an undef should not propagate.
49869 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
49870                                         const X86Subtarget &Subtarget) {
49871   if (!isNullFPScalarOrVectorConst(V))
49872     return SDValue();
49873 
49874   if (V.getValueType().isVector())
49875     return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
49876 
49877   return V;
49878 }
49879 
49880 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
49881                                       const X86Subtarget &Subtarget) {
49882   SDValue N0 = N->getOperand(0);
49883   SDValue N1 = N->getOperand(1);
49884   EVT VT = N->getValueType(0);
49885   SDLoc DL(N);
49886 
49887   // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
49888   if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
49889         (VT == MVT::f64 && Subtarget.hasSSE2()) ||
49890         (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
49891     return SDValue();
49892 
49893   auto isAllOnesConstantFP = [](SDValue V) {
49894     if (V.getSimpleValueType().isVector())
49895       return ISD::isBuildVectorAllOnes(V.getNode());
49896     auto *C = dyn_cast<ConstantFPSDNode>(V);
49897     return C && C->getConstantFPValue()->isAllOnesValue();
49898   };
49899 
49900   // fand (fxor X, -1), Y --> fandn X, Y
49901   if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
49902     return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
49903 
49904   // fand X, (fxor Y, -1) --> fandn Y, X
49905   if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
49906     return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
49907 
49908   return SDValue();
49909 }
49910 
49911 /// Do target-specific dag combines on X86ISD::FAND nodes.
49912 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
49913                            const X86Subtarget &Subtarget) {
49914   // FAND(0.0, x) -> 0.0
49915   if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
49916     return V;
49917 
49918   // FAND(x, 0.0) -> 0.0
49919   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
49920     return V;
49921 
49922   if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
49923     return V;
49924 
49925   return lowerX86FPLogicOp(N, DAG, Subtarget);
49926 }
49927 
49928 /// Do target-specific dag combines on X86ISD::FANDN nodes.
49929 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
49930                             const X86Subtarget &Subtarget) {
49931   // FANDN(0.0, x) -> x
49932   if (isNullFPScalarOrVectorConst(N->getOperand(0)))
49933     return N->getOperand(1);
49934 
49935   // FANDN(x, 0.0) -> 0.0
49936   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
49937     return V;
49938 
49939   return lowerX86FPLogicOp(N, DAG, Subtarget);
49940 }
49941 
49942 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
49943 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
49944                           TargetLowering::DAGCombinerInfo &DCI,
49945                           const X86Subtarget &Subtarget) {
49946   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
49947 
49948   // F[X]OR(0.0, x) -> x
49949   if (isNullFPScalarOrVectorConst(N->getOperand(0)))
49950     return N->getOperand(1);
49951 
49952   // F[X]OR(x, 0.0) -> x
49953   if (isNullFPScalarOrVectorConst(N->getOperand(1)))
49954     return N->getOperand(0);
49955 
49956   if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
49957     return NewVal;
49958 
49959   return lowerX86FPLogicOp(N, DAG, Subtarget);
49960 }
49961 
49962 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
49963 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
49964   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
49965 
49966   // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
49967   if (!DAG.getTarget().Options.NoNaNsFPMath ||
49968       !DAG.getTarget().Options.NoSignedZerosFPMath)
49969     return SDValue();
49970 
  // If no NaNs and no signed zeros are allowed, convert the FMIN and FMAX
  // nodes into FMINC and FMAXC, which are commutative operations.
49973   unsigned NewOp = 0;
49974   switch (N->getOpcode()) {
49975     default: llvm_unreachable("unknown opcode");
49976     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
49977     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
49978   }
49979 
49980   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
49981                      N->getOperand(0), N->getOperand(1));
49982 }
49983 
49984 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
49985                                      const X86Subtarget &Subtarget) {
49986   if (Subtarget.useSoftFloat())
49987     return SDValue();
49988 
49989   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49990 
49991   EVT VT = N->getValueType(0);
49992   if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
49993         (Subtarget.hasSSE2() && VT == MVT::f64) ||
49994         (Subtarget.hasFP16() && VT == MVT::f16) ||
49995         (VT.isVector() && TLI.isTypeLegal(VT))))
49996     return SDValue();
49997 
49998   SDValue Op0 = N->getOperand(0);
49999   SDValue Op1 = N->getOperand(1);
50000   SDLoc DL(N);
50001   auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
50002 
50003   // If we don't have to respect NaN inputs, this is a direct translation to x86
50004   // min/max instructions.
50005   if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
50006     return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
50007 
  // If one of the operands is known non-NaN, use the native min/max
  // instructions with the non-NaN input as the second operand.
50010   if (DAG.isKnownNeverNaN(Op1))
50011     return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
50012   if (DAG.isKnownNeverNaN(Op0))
50013     return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
50014 
50015   // If we have to respect NaN inputs, this takes at least 3 instructions.
50016   // Favor a library call when operating on a scalar and minimizing code size.
50017   if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
50018     return SDValue();
50019 
50020   EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
50021                                          VT);
50022 
50023   // There are 4 possibilities involving NaN inputs, and these are the required
50024   // outputs:
50025   //                   Op1
50026   //               Num     NaN
50027   //            ----------------
50028   //       Num  |  Max  |  Op0 |
50029   // Op0        ----------------
50030   //       NaN  |  Op1  |  NaN |
50031   //            ----------------
50032   //
50033   // The SSE FP max/min instructions were not designed for this case, but rather
50034   // to implement:
50035   //   Min = Op1 < Op0 ? Op1 : Op0
50036   //   Max = Op1 > Op0 ? Op1 : Op0
50037   //
50038   // So they always return Op0 if either input is a NaN. However, we can still
50039   // use those instructions for fmaxnum by selecting away a NaN input.
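  // In pseudocode, the sequence built below (shown for fmaxnum; fminnum is
  // analogous) is:
  //   MinOrMax = max(Op1, Op0)         // returns Op0 if either input is NaN
  //   IsOp0Nan = setcc(Op0, Op0, UO)   // true iff Op0 is NaN
  //   Result   = IsOp0Nan ? Op1 : MinOrMax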
50040 
50041   // If either operand is NaN, the 2nd source operand (Op0) is passed through.
50042   SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
50043   SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
50044 
50045   // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
50046   // are NaN, the NaN value of Op1 is the result.
50047   return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
50048 }
50049 
50050 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
50051                                    TargetLowering::DAGCombinerInfo &DCI) {
50052   EVT VT = N->getValueType(0);
50053   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50054 
50055   APInt KnownUndef, KnownZero;
50056   APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50057   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
50058                                      KnownZero, DCI))
50059     return SDValue(N, 0);
50060 
50061   // Convert a full vector load into vzload when not all bits are needed.
50062   SDValue In = N->getOperand(0);
50063   MVT InVT = In.getSimpleValueType();
50064   if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
50065       ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
50066     assert(InVT.is128BitVector() && "Expected 128-bit input vector");
50067     LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
50068     unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
50069     MVT MemVT = MVT::getIntegerVT(NumBits);
50070     MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
50071     if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
50072       SDLoc dl(N);
50073       SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
50074                                     DAG.getBitcast(InVT, VZLoad));
50075       DCI.CombineTo(N, Convert);
50076       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
50077       DCI.recursivelyDeleteUnusedNodes(LN);
50078       return SDValue(N, 0);
50079     }
50080   }
50081 
50082   return SDValue();
50083 }
50084 
50085 static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
50086                                      TargetLowering::DAGCombinerInfo &DCI) {
50087   bool IsStrict = N->isTargetStrictFPOpcode();
50088   EVT VT = N->getValueType(0);
50089 
50090   // Convert a full vector load into vzload when not all bits are needed.
50091   SDValue In = N->getOperand(IsStrict ? 1 : 0);
50092   MVT InVT = In.getSimpleValueType();
50093   if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
50094       ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
50095     assert(InVT.is128BitVector() && "Expected 128-bit input vector");
50096     LoadSDNode *LN = cast<LoadSDNode>(In);
50097     unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
50098     MVT MemVT = MVT::getFloatingPointVT(NumBits);
50099     MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
50100     if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
50101       SDLoc dl(N);
50102       if (IsStrict) {
50103         SDValue Convert =
50104             DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
50105                         {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
50106         DCI.CombineTo(N, Convert, Convert.getValue(1));
50107       } else {
50108         SDValue Convert =
50109             DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
50110         DCI.CombineTo(N, Convert);
50111       }
50112       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
50113       DCI.recursivelyDeleteUnusedNodes(LN);
50114       return SDValue(N, 0);
50115     }
50116   }
50117 
50118   return SDValue();
50119 }
50120 
50121 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
50122 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
50123                             TargetLowering::DAGCombinerInfo &DCI,
50124                             const X86Subtarget &Subtarget) {
50125   MVT VT = N->getSimpleValueType(0);
50126 
50127   // ANDNP(0, x) -> x
50128   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
50129     return N->getOperand(1);
50130 
50131   // ANDNP(x, 0) -> 0
50132   if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
50133     return DAG.getConstant(0, SDLoc(N), VT);
50134 
50135   // Turn ANDNP back to AND if input is inverted.
50136   if (SDValue Not = IsNOT(N->getOperand(0), DAG))
50137     return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
50138                        N->getOperand(1));
50139 
50140   // Attempt to recursively combine a bitmask ANDNP with shuffles.
50141   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50142     SDValue Op(N, 0);
50143     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50144       return Res;
50145   }
50146 
50147   return SDValue();
50148 }
50149 
50150 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
50151                          TargetLowering::DAGCombinerInfo &DCI) {
50152   SDValue N1 = N->getOperand(1);
50153 
50154   // BT ignores high bits in the bit index operand.
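  // e.g. (illustrative) for a 32-bit BT, only the low 5 bits of the index are
  // demanded, so a redundant (and %idx, 31) on the index can be dropped.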
50155   unsigned BitWidth = N1.getValueSizeInBits();
50156   APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
50157   if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
50158     if (N->getOpcode() != ISD::DELETED_NODE)
50159       DCI.AddToWorklist(N);
50160     return SDValue(N, 0);
50161   }
50162 
50163   return SDValue();
50164 }
50165 
50166 static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
50167                                TargetLowering::DAGCombinerInfo &DCI) {
50168   bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
50169   SDValue Src = N->getOperand(IsStrict ? 1 : 0);
50170 
50171   if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
50172     APInt KnownUndef, KnownZero;
50173     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50174     APInt DemandedElts = APInt::getLowBitsSet(8, 4);
50175     if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
50176                                        DCI)) {
50177       if (N->getOpcode() != ISD::DELETED_NODE)
50178         DCI.AddToWorklist(N);
50179       return SDValue(N, 0);
50180     }
50181 
50182     // Convert a full vector load into vzload when not all bits are needed.
50183     if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
50184       LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
50185       if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
50186         SDLoc dl(N);
50187         if (IsStrict) {
50188           SDValue Convert = DAG.getNode(
50189               N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
50190               {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
50191           DCI.CombineTo(N, Convert, Convert.getValue(1));
50192         } else {
50193           SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
50194                                         DAG.getBitcast(MVT::v8i16, VZLoad));
50195           DCI.CombineTo(N, Convert);
50196         }
50197 
50198         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
50199         DCI.recursivelyDeleteUnusedNodes(LN);
50200         return SDValue(N, 0);
50201       }
50202     }
50203   }
50204 
50205   return SDValue();
50206 }
50207 
50208 // Try to combine sext_in_reg of a cmov of constants by extending the constants.
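// e.g. (illustrative):
//   (sext_in_reg (cmov C1, C2, cc, eflags), i8)
//     --> (cmov (sext_in_reg C1, i8), (sext_in_reg C2, i8), cc, eflags)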
50209 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
50210   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
50211 
50212   EVT DstVT = N->getValueType(0);
50213 
50214   SDValue N0 = N->getOperand(0);
50215   SDValue N1 = N->getOperand(1);
50216   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
50217 
50218   if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
50219     return SDValue();
50220 
50221   // Look through single use any_extends / truncs.
50222   SDValue IntermediateBitwidthOp;
50223   if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
50224       N0.hasOneUse()) {
50225     IntermediateBitwidthOp = N0;
50226     N0 = N0.getOperand(0);
50227   }
50228 
50229   // See if we have a single use cmov.
50230   if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
50231     return SDValue();
50232 
50233   SDValue CMovOp0 = N0.getOperand(0);
50234   SDValue CMovOp1 = N0.getOperand(1);
50235 
50236   // Make sure both operands are constants.
50237   if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
50238       !isa<ConstantSDNode>(CMovOp1.getNode()))
50239     return SDValue();
50240 
50241   SDLoc DL(N);
50242 
  // If we looked through an any_extend/trunc above, apply the same op to the
  // constants.
50244   if (IntermediateBitwidthOp) {
50245     unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
50246     CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
50247     CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
50248   }
50249 
50250   CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
50251   CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
50252 
50253   EVT CMovVT = DstVT;
  // We do not want i16 CMOVs. Promote to i32 and truncate afterwards.
50255   if (DstVT == MVT::i16) {
50256     CMovVT = MVT::i32;
50257     CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
50258     CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
50259   }
50260 
50261   SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
50262                              N0.getOperand(2), N0.getOperand(3));
50263 
50264   if (CMovVT != DstVT)
50265     CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
50266 
50267   return CMov;
50268 }
50269 
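// Combine SIGN_EXTEND_INREG nodes: fold cmov-of-constants patterns and split
// expensive v4i64 sign extensions into a v4i32 sext_in_reg followed by a
// sext.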
50270 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
50271                                       const X86Subtarget &Subtarget) {
50272   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
50273 
50274   if (SDValue V = combineSextInRegCmov(N, DAG))
50275     return V;
50276 
50277   EVT VT = N->getValueType(0);
50278   SDValue N0 = N->getOperand(0);
50279   SDValue N1 = N->getOperand(1);
50280   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
50281   SDLoc dl(N);
50282 
  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
  // since there is no sign-extended shift right operation on a vector with
  // 64-bit elements.
  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
50288   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
50289                            N0.getOpcode() == ISD::SIGN_EXTEND)) {
50290     SDValue N00 = N0.getOperand(0);
50291 
50292     // EXTLOAD has a better solution on AVX2,
50293     // it may be replaced with X86ISD::VSEXT node.
50294     if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
50295       if (!ISD::isNormalLoad(N00.getNode()))
50296         return SDValue();
50297 
    // Attempt to promote any comparison mask ops before the SIGN_EXTEND_INREG
    // gets in the way.
50300     if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
50301       return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
50302 
50303     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
50304       SDValue Tmp =
50305           DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
50306       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
50307     }
50308   }
50309   return SDValue();
50310 }
50311 
50312 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
50313 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
50314 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
50315 /// opportunities to combine math ops, use an LEA, or use a complex addressing
50316 /// mode. This can eliminate extend, add, and shift instructions.
50317 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
50318                                    const X86Subtarget &Subtarget) {
50319   if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
50320       Ext->getOpcode() != ISD::ZERO_EXTEND)
50321     return SDValue();
50322 
50323   // TODO: This should be valid for other integer types.
50324   EVT VT = Ext->getValueType(0);
50325   if (VT != MVT::i64)
50326     return SDValue();
50327 
50328   SDValue Add = Ext->getOperand(0);
50329   if (Add.getOpcode() != ISD::ADD)
50330     return SDValue();
50331 
50332   bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
50333   bool NSW = Add->getFlags().hasNoSignedWrap();
50334   bool NUW = Add->getFlags().hasNoUnsignedWrap();
50335 
  // We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding into
  // the 'zext'.
50338   if ((Sext && !NSW) || (!Sext && !NUW))
50339     return SDValue();
50340 
50341   // Having a constant operand to the 'add' ensures that we are not increasing
50342   // the instruction count because the constant is extended for free below.
50343   // A constant operand can also become the displacement field of an LEA.
50344   auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
50345   if (!AddOp1)
50346     return SDValue();
50347 
50348   // Don't make the 'add' bigger if there's no hope of combining it with some
50349   // other 'add' or 'shl' instruction.
50350   // TODO: It may be profitable to generate simpler LEA instructions in place
50351   // of single 'add' instructions, but the cost model for selecting an LEA
50352   // currently has a high threshold.
50353   bool HasLEAPotential = false;
50354   for (auto *User : Ext->uses()) {
50355     if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
50356       HasLEAPotential = true;
50357       break;
50358     }
50359   }
50360   if (!HasLEAPotential)
50361     return SDValue();
50362 
50363   // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
50364   int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
50365   SDValue AddOp0 = Add.getOperand(0);
50366   SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
50367   SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
50368 
  // The wider add is guaranteed to not wrap because both operands are
  // extended in the same way as the original no-wrap add.
50371   SDNodeFlags Flags;
50372   Flags.setNoSignedWrap(NSW);
50373   Flags.setNoUnsignedWrap(NUW);
50374   return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
50375 }
50376 
50377 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
50378 // operands and the result of CMOV is not used anywhere else - promote CMOV
50379 // itself instead of promoting its result. This could be beneficial, because:
50380 //     1) X86TargetLowering::EmitLoweredSelect later can do merging of two
50381 //        (or more) pseudo-CMOVs only when they go one-after-another and
50382 //        getting rid of result extension code after CMOV will help that.
50383 //     2) Promotion of constant CMOV arguments is free, hence the
50384 //        {ANY,SIGN,ZERO}_EXTEND will just be deleted.
//     3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
//        promotion is also good in terms of code-size.
//        (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
//         promotion).
50389 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
50390   SDValue CMovN = Extend->getOperand(0);
50391   if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
50392     return SDValue();
50393 
50394   EVT TargetVT = Extend->getValueType(0);
50395   unsigned ExtendOpcode = Extend->getOpcode();
50396   SDLoc DL(Extend);
50397 
50398   EVT VT = CMovN.getValueType();
50399   SDValue CMovOp0 = CMovN.getOperand(0);
50400   SDValue CMovOp1 = CMovN.getOperand(1);
50401 
50402   if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
50403       !isa<ConstantSDNode>(CMovOp1.getNode()))
50404     return SDValue();
50405 
50406   // Only extend to i32 or i64.
50407   if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
50408     return SDValue();
50409 
  // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from
  // i32 are free.
50412   if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
50413     return SDValue();
50414 
  // If this is a zero/any extend to i64, we should only extend to i32 and
  // use a free zero extend to finish.
50417   EVT ExtendVT = TargetVT;
50418   if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
50419     ExtendVT = MVT::i32;
50420 
50421   CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
50422   CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
50423 
50424   SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
50425                             CMovN.getOperand(2), CMovN.getOperand(3));
50426 
50427   // Finish extending if needed.
50428   if (ExtendVT != TargetVT)
50429     Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
50430 
50431   return Res;
50432 }
50433 
50434 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
50435 // This is more or less the reverse of combineBitcastvxi1.
50436 static SDValue
50437 combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
50438                                TargetLowering::DAGCombinerInfo &DCI,
50439                                const X86Subtarget &Subtarget) {
50440   unsigned Opcode = N->getOpcode();
50441   if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
50442       Opcode != ISD::ANY_EXTEND)
50443     return SDValue();
50444   if (!DCI.isBeforeLegalizeOps())
50445     return SDValue();
50446   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
50447     return SDValue();
50448 
50449   SDValue N0 = N->getOperand(0);
50450   EVT VT = N->getValueType(0);
50451   EVT SVT = VT.getScalarType();
50452   EVT InSVT = N0.getValueType().getScalarType();
50453   unsigned EltSizeInBits = SVT.getSizeInBits();
50454 
  // We must be extending a vector of booleans (bitcast from a scalar
  // integer) to a legal integer vector type.
50457   if (!VT.isVector())
50458     return SDValue();
50459   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
50460     return SDValue();
50461   if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
50462     return SDValue();
50463 
50464   SDValue N00 = N0.getOperand(0);
50465   EVT SclVT = N0.getOperand(0).getValueType();
50466   if (!SclVT.isScalarInteger())
50467     return SDValue();
50468 
50469   SDLoc DL(N);
50470   SDValue Vec;
50471   SmallVector<int, 32> ShuffleMask;
50472   unsigned NumElts = VT.getVectorNumElements();
50473   assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
50474 
50475   // Broadcast the scalar integer to the vector elements.
50476   if (NumElts > EltSizeInBits) {
50477     // If the scalar integer is greater than the vector element size, then we
50478     // must split it down into sub-sections for broadcasting. For example:
50479     //   i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
50480     //   i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
50481     assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
50482     unsigned Scale = NumElts / EltSizeInBits;
50483     EVT BroadcastVT =
50484         EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
50485     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
50486     Vec = DAG.getBitcast(VT, Vec);
50487 
50488     for (unsigned i = 0; i != Scale; ++i)
50489       ShuffleMask.append(EltSizeInBits, i);
50490     Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
50491   } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
50492              (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
50493     // If we have register broadcast instructions, use the scalar size as the
50494     // element type for the shuffle. Then cast to the wider element type. The
50495     // widened bits won't be used, and this might allow the use of a broadcast
50496     // load.
50497     assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
50498     unsigned Scale = EltSizeInBits / NumElts;
50499     EVT BroadcastVT =
50500         EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
50501     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
50502     ShuffleMask.append(NumElts * Scale, 0);
50503     Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
50504     Vec = DAG.getBitcast(VT, Vec);
50505   } else {
    // For a smaller scalar integer, we can simply any-extend it to the vector
    // element size (we don't care about the upper bits) and broadcast it to
    // all elements.
50509     SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
50510     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
50511     ShuffleMask.append(NumElts, 0);
50512     Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
50513   }
50514 
50515   // Now, mask the relevant bit in each element.
50516   SmallVector<SDValue, 32> Bits;
50517   for (unsigned i = 0; i != NumElts; ++i) {
50518     int BitIdx = (i % EltSizeInBits);
50519     APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
50520     Bits.push_back(DAG.getConstant(Bit, DL, SVT));
50521   }
50522   SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
50523   Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
50524 
50525   // Compare against the bitmask and extend the result.
50526   EVT CCVT = VT.changeVectorElementType(MVT::i1);
50527   Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
50528   Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
50529 
  // For SEXT, this is now done; otherwise shift the result down for
  // zero-extension.
50532   if (Opcode == ISD::SIGN_EXTEND)
50533     return Vec;
50534   return DAG.getNode(ISD::SRL, DL, VT, Vec,
50535                      DAG.getConstant(EltSizeInBits - 1, DL, VT));
50536 }
50537 
50538 // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
50539 // result type.
50540 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
50541                                const X86Subtarget &Subtarget) {
50542   SDValue N0 = N->getOperand(0);
50543   EVT VT = N->getValueType(0);
50544   SDLoc dl(N);
50545 
50546   // Only do this combine with AVX512 for vector extends.
50547   if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
50548     return SDValue();
50549 
50550   // Only combine legal element types.
50551   EVT SVT = VT.getVectorElementType();
50552   if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
50553       SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
50554     return SDValue();
50555 
  // We don't have a CMPP instruction for vXf16.
50557   if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
50558     return SDValue();
  // We can only do this if the vector size is 256 bits or less.
50560   unsigned Size = VT.getSizeInBits();
50561   if (Size > 256 && Subtarget.useAVX512Regs())
50562     return SDValue();
50563 
  // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
  // those are the only integer compares we have.
50566   ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
50567   if (ISD::isUnsignedIntSetCC(CC))
50568     return SDValue();
50569 
50570   // Only do this combine if the extension will be fully consumed by the setcc.
50571   EVT N00VT = N0.getOperand(0).getValueType();
50572   EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
50573   if (Size != MatchingVecType.getSizeInBits())
50574     return SDValue();
50575 
50576   SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
50577 
50578   if (N->getOpcode() == ISD::ZERO_EXTEND)
50579     Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
50580 
50581   return Res;
50582 }
50583 
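// Combine ISD::SIGN_EXTEND nodes: widen SETCC_CARRY results, extend CMOVs of
// constants, convert bool vector extensions, and pull extends ahead of
// no-wrap adds to expose LEA opportunities.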
50584 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
50585                            TargetLowering::DAGCombinerInfo &DCI,
50586                            const X86Subtarget &Subtarget) {
50587   SDValue N0 = N->getOperand(0);
50588   EVT VT = N->getValueType(0);
50589   SDLoc DL(N);
50590 
50591   // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
50592   if (!DCI.isBeforeLegalizeOps() &&
50593       N0.getOpcode() == X86ISD::SETCC_CARRY) {
50594     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
50595                                  N0->getOperand(1));
50596     bool ReplaceOtherUses = !N0.hasOneUse();
50597     DCI.CombineTo(N, Setcc);
50598     // Replace other uses with a truncate of the widened setcc_carry.
50599     if (ReplaceOtherUses) {
50600       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
50601                                   N0.getValueType(), Setcc);
50602       DCI.CombineTo(N0.getNode(), Trunc);
50603     }
50604 
50605     return SDValue(N, 0);
50606   }
50607 
50608   if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
50609     return NewCMov;
50610 
50611   if (!DCI.isBeforeLegalizeOps())
50612     return SDValue();
50613 
50614   if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
50615     return V;
50616 
50617   if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
50618     return V;
50619 
50620   if (VT.isVector()) {
50621     if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
50622       return R;
50623 
50624     if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
50625       return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
50626   }
50627 
50628   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
50629     return NewAdd;
50630 
50631   return SDValue();
50632 }
50633 
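// Combine FMA nodes: split into FMUL+FADD when FMA must be expanded and
// reassociation is allowed, and fold negated operands (including negations
// behind extract_vector_elt) into the FMA opcode itself.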
50634 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
50635                           TargetLowering::DAGCombinerInfo &DCI,
50636                           const X86Subtarget &Subtarget) {
50637   SDLoc dl(N);
50638   EVT VT = N->getValueType(0);
50639   bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
50640 
50641   // Let legalize expand this if it isn't a legal type yet.
50642   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50643   if (!TLI.isTypeLegal(VT))
50644     return SDValue();
50645 
50646   SDValue A = N->getOperand(IsStrict ? 1 : 0);
50647   SDValue B = N->getOperand(IsStrict ? 2 : 1);
50648   SDValue C = N->getOperand(IsStrict ? 3 : 2);
50649 
50650   // If the operation allows fast-math and the target does not support FMA,
50651   // split this into mul+add to avoid libcall(s).
50652   SDNodeFlags Flags = N->getFlags();
50653   if (!IsStrict && Flags.hasAllowReassociation() &&
50654       TLI.isOperationExpand(ISD::FMA, VT)) {
50655     SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
50656     return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
50657   }
50658 
50659   EVT ScalarVT = VT.getScalarType();
50660   if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
50661        !Subtarget.hasAnyFMA()) &&
50662       !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
50663     return SDValue();
50664 
50665   auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
50666     bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
50667     bool LegalOperations = !DCI.isBeforeLegalizeOps();
50668     if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
50669                                                        CodeSize)) {
50670       V = NegV;
50671       return true;
50672     }
50673     // Look through extract_vector_elts. If it comes from an FNEG, create a
50674     // new extract from the FNEG input.
50675     if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
50676         isNullConstant(V.getOperand(1))) {
50677       SDValue Vec = V.getOperand(0);
50678       if (SDValue NegV = TLI.getCheaperNegatedExpression(
50679               Vec, DAG, LegalOperations, CodeSize)) {
50680         V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
50681                         NegV, V.getOperand(1));
50682         return true;
50683       }
50684     }
50685 
50686     return false;
50687   };
50688 
50689   // Do not convert the passthru input of scalar intrinsics.
50690   // FIXME: We could allow negations of the lower element only.
50691   bool NegA = invertIfNegative(A);
50692   bool NegB = invertIfNegative(B);
50693   bool NegC = invertIfNegative(C);
50694 
50695   if (!NegA && !NegB && !NegC)
50696     return SDValue();
50697 
50698   unsigned NewOpcode =
50699       negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
50700 
50701   // Propagate fast-math-flags to new FMA node.
50702   SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
50703   if (IsStrict) {
50704     assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
50705     return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
50706                        {N->getOperand(0), A, B, C});
50707   } else {
50708     if (N->getNumOperands() == 4)
50709       return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
50710     return DAG.getNode(NewOpcode, dl, VT, A, B, C);
50711   }
50712 }
50713 
50714 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
50715 // Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
50716 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
50717                                TargetLowering::DAGCombinerInfo &DCI) {
50718   SDLoc dl(N);
50719   EVT VT = N->getValueType(0);
50720   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50721   bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
50722   bool LegalOperations = !DCI.isBeforeLegalizeOps();
50723 
50724   SDValue N2 = N->getOperand(2);
50725 
50726   SDValue NegN2 =
50727       TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
50728   if (!NegN2)
50729     return SDValue();
50730   unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
50731 
50732   if (N->getNumOperands() == 4)
50733     return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
50734                        NegN2, N->getOperand(3));
50735   return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
50736                      NegN2);
50737 }
50738 
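// Combine ISD::ZERO_EXTEND and ISD::ANY_EXTEND nodes. Shares most of the
// SIGN_EXTEND combines and also folds an extend of a PACKUS whose inputs
// already have zero upper halves into a concatenation of those inputs.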
50739 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
50740                            TargetLowering::DAGCombinerInfo &DCI,
50741                            const X86Subtarget &Subtarget) {
50742   SDLoc dl(N);
50743   SDValue N0 = N->getOperand(0);
50744   EVT VT = N->getValueType(0);
50745 
50746   // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
50747   // FIXME: Is this needed? We don't seem to have any tests for it.
50748   if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
50749       N0.getOpcode() == X86ISD::SETCC_CARRY) {
50750     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
50751                                  N0->getOperand(1));
50752     bool ReplaceOtherUses = !N0.hasOneUse();
50753     DCI.CombineTo(N, Setcc);
50754     // Replace other uses with a truncate of the widened setcc_carry.
50755     if (ReplaceOtherUses) {
50756       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
50757                                   N0.getValueType(), Setcc);
50758       DCI.CombineTo(N0.getNode(), Trunc);
50759     }
50760 
50761     return SDValue(N, 0);
50762   }
50763 
50764   if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
50765     return NewCMov;
50766 
50767   if (DCI.isBeforeLegalizeOps())
50768     if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
50769       return V;
50770 
50771   if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
50772     return V;
50773 
50774   if (VT.isVector())
50775     if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
50776       return R;
50777 
50778   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
50779     return NewAdd;
50780 
50781   if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
50782     return R;
50783 
50784   // TODO: Combine with any target/faux shuffle.
50785   if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
50786       VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
50787     SDValue N00 = N0.getOperand(0);
50788     SDValue N01 = N0.getOperand(1);
50789     unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
50790     APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
50791     if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
50792         (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
50793       return concatSubVectors(N00, N01, DAG, dl);
50794     }
50795   }
50796 
50797   return SDValue();
50798 }
50799 
50800 /// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
50801 /// recognizable memcmp expansion.
50802 static bool isOrXorXorTree(SDValue X, bool Root = true) {
50803   if (X.getOpcode() == ISD::OR)
50804     return isOrXorXorTree(X.getOperand(0), false) &&
50805            isOrXorXorTree(X.getOperand(1), false);
50806   if (Root)
50807     return false;
50808   return X.getOpcode() == ISD::XOR;
50809 }
50810 
50811 /// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
50812 /// expansion.
50813 template<typename F>
50814 static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
50815                                 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
50816   SDValue Op0 = X.getOperand(0);
50817   SDValue Op1 = X.getOperand(1);
50818   if (X.getOpcode() == ISD::OR) {
50819     SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
50820     SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
50821     if (VecVT != CmpVT)
50822       return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
50823     if (HasPT)
50824       return DAG.getNode(ISD::OR, DL, VecVT, A, B);
50825     return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
50826   } else if (X.getOpcode() == ISD::XOR) {
50827     SDValue A = SToV(Op0);
50828     SDValue B = SToV(Op1);
50829     if (VecVT != CmpVT)
50830       return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
50831     if (HasPT)
50832       return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
50833     return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
50834   }
50835   llvm_unreachable("Impossible");
50836 }
50837 
50838 /// Try to map a 128-bit or larger integer comparison to vector instructions
50839 /// before type legalization splits it up into chunks.
50840 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
50841                                                const X86Subtarget &Subtarget) {
50842   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
50843   assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
50844 
50845   // We're looking for an oversized integer equality comparison.
50846   SDValue X = SetCC->getOperand(0);
50847   SDValue Y = SetCC->getOperand(1);
50848   EVT OpVT = X.getValueType();
50849   unsigned OpSize = OpVT.getSizeInBits();
50850   if (!OpVT.isScalarInteger() || OpSize < 128)
50851     return SDValue();
50852 
50853   // Ignore a comparison with zero because that gets special treatment in
50854   // EmitTest(). But make an exception for the special case of a pair of
50855   // logically-combined vector-sized operands compared to zero. This pattern may
50856   // be generated by the memcmp expansion pass with oversized integer compares
50857   // (see PR33325).
50858   bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
50859   if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
50860     return SDValue();
50861 
50862   // Don't perform this combine if constructing the vector will be expensive.
50863   auto IsVectorBitCastCheap = [](SDValue X) {
50864     X = peekThroughBitcasts(X);
50865     return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
50866            X.getOpcode() == ISD::LOAD;
50867   };
50868   if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
50869       !IsOrXorXorTreeCCZero)
50870     return SDValue();
50871 
50872   EVT VT = SetCC->getValueType(0);
50873   SDLoc DL(SetCC);
50874 
50875   // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
50876   // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
50877   // Otherwise use PCMPEQ (plus AND) and mask testing.
50878   if ((OpSize == 128 && Subtarget.hasSSE2()) ||
50879       (OpSize == 256 && Subtarget.hasAVX()) ||
50880       (OpSize == 512 && Subtarget.useAVX512Regs())) {
50881     bool HasPT = Subtarget.hasSSE41();
50882 
    // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and
    // widened vector registers are essentially free. (Technically, widening
    // registers prevents load folding, but the tradeoff is worth it.)
50886     bool PreferKOT = Subtarget.preferMaskRegisters();
50887     bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
50888 
50889     EVT VecVT = MVT::v16i8;
50890     EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
50891     if (OpSize == 256) {
50892       VecVT = MVT::v32i8;
50893       CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
50894     }
50895     EVT CastVT = VecVT;
50896     bool NeedsAVX512FCast = false;
50897     if (OpSize == 512 || NeedZExt) {
50898       if (Subtarget.hasBWI()) {
50899         VecVT = MVT::v64i8;
50900         CmpVT = MVT::v64i1;
50901         if (OpSize == 512)
50902           CastVT = VecVT;
50903       } else {
50904         VecVT = MVT::v16i32;
50905         CmpVT = MVT::v16i1;
50906         CastVT = OpSize == 512 ? VecVT :
50907                  OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
50908         NeedsAVX512FCast = true;
50909       }
50910     }
50911 
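    // Bitcast a scalar operand into the chosen vector type. If the operand
    // is a zero_extend of a narrower scalar, bitcast the original value
    // instead and widen it by inserting into a zero vector.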
50912     auto ScalarToVector = [&](SDValue X) -> SDValue {
50913       bool TmpZext = false;
50914       EVT TmpCastVT = CastVT;
50915       if (X.getOpcode() == ISD::ZERO_EXTEND) {
50916         SDValue OrigX = X.getOperand(0);
50917         unsigned OrigSize = OrigX.getScalarValueSizeInBits();
50918         if (OrigSize < OpSize) {
50919           if (OrigSize == 128) {
50920             TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
50921             X = OrigX;
50922             TmpZext = true;
50923           } else if (OrigSize == 256) {
50924             TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
50925             X = OrigX;
50926             TmpZext = true;
50927           }
50928         }
50929       }
50930       X = DAG.getBitcast(TmpCastVT, X);
50931       if (!NeedZExt && !TmpZext)
50932         return X;
50933       return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
50934                          DAG.getConstant(0, DL, VecVT), X,
50935                          DAG.getVectorIdxConstant(0, DL));
50936     };
50937 
50938     SDValue Cmp;
50939     if (IsOrXorXorTreeCCZero) {
50940       // This is a bitwise-combined equality comparison of 2 pairs of vectors:
50941       // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
50942       // Use 2 vector equality compares and 'and' the results before doing a
50943       // MOVMSK.
50944       Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
50945     } else {
50946       SDValue VecX = ScalarToVector(X);
50947       SDValue VecY = ScalarToVector(Y);
50948       if (VecVT != CmpVT) {
50949         Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
50950       } else if (HasPT) {
50951         Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
50952       } else {
50953         Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
50954       }
50955     }
50956     // AVX512 should emit a setcc that will lower to kortest.
50957     if (VecVT != CmpVT) {
50958       EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
50959                    CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
50960       return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
50961                           DAG.getConstant(0, DL, KRegVT), CC);
50962     }
50963     if (HasPT) {
50964       SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
50965                                      Cmp);
50966       SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
50967       X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
50968       SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
50969       return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
50970     }
50971     // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
50972     // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
50973     // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
50974     assert(Cmp.getValueType() == MVT::v16i8 &&
50975            "Non 128-bit vector on pre-SSE41 target");
50976     SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
50977     SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
50978     return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
50979   }
50980 
50981   return SDValue();
50982 }
50983 
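// Combine ISD::SETCC nodes: map oversized integer equality compares to
// vector code, rewrite or/and-based equality patterns against zero, simplify
// vXi1 compares, and pre-promote or pre-lower cases that legalization
// handles poorly.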
50984 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
50985                             TargetLowering::DAGCombinerInfo &DCI,
50986                             const X86Subtarget &Subtarget) {
50987   const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
50988   const SDValue LHS = N->getOperand(0);
50989   const SDValue RHS = N->getOperand(1);
50990   EVT VT = N->getValueType(0);
50991   EVT OpVT = LHS.getValueType();
50992   SDLoc DL(N);
50993 
50994   if (CC == ISD::SETNE || CC == ISD::SETEQ) {
50995     if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
50996       return V;
50997 
50998     if (VT == MVT::i1 && isNullConstant(RHS)) {
50999       SDValue X86CC;
51000       if (SDValue V =
51001               MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
51002         return DAG.getNode(ISD::TRUNCATE, DL, VT,
51003                            DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
51004     }
51005 
51006     if (OpVT.isScalarInteger()) {
51007       // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
51008       // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
51009       auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
51010         if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
51011           if (N0.getOperand(0) == N1)
51012             return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
51013                                N0.getOperand(1));
51014           if (N0.getOperand(1) == N1)
51015             return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
51016                                N0.getOperand(0));
51017         }
51018         return SDValue();
51019       };
51020       if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
51021         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
51022       if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
51023         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
51024 
51025       // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
51026       // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
51027       auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
51028         if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
51029           if (N0.getOperand(0) == N1)
51030             return DAG.getNode(ISD::AND, DL, OpVT, N1,
51031                                DAG.getNOT(DL, N0.getOperand(1), OpVT));
51032           if (N0.getOperand(1) == N1)
51033             return DAG.getNode(ISD::AND, DL, OpVT, N1,
51034                                DAG.getNOT(DL, N0.getOperand(0), OpVT));
51035         }
51036         return SDValue();
51037       };
51038       if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
51039         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
51040       if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
51041         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
51042 
51043       // cmpeq(trunc(x),0) --> cmpeq(x,0)
51044       // cmpne(trunc(x),0) --> cmpne(x,0)
      // iff the upper bits of x are zero.
51046       // TODO: Add support for RHS to be truncate as well?
51047       if (LHS.getOpcode() == ISD::TRUNCATE &&
51048           LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
51049           isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
51050         EVT SrcVT = LHS.getOperand(0).getValueType();
51051         APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
51052                                                 OpVT.getScalarSizeInBits());
51053         const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51054         if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
51055             TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
51056           return DAG.getSetCC(DL, VT, LHS.getOperand(0),
51057                               DAG.getConstant(0, DL, SrcVT), CC);
51058       }
51059     }
51060   }
51061 
51062   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
51063       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
51064     // Using temporaries to avoid messing up operand ordering for later
51065     // transformations if this doesn't work.
51066     SDValue Op0 = LHS;
51067     SDValue Op1 = RHS;
51068     ISD::CondCode TmpCC = CC;
51069     // Put build_vector on the right.
51070     if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
51071       std::swap(Op0, Op1);
51072       TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
51073     }
51074 
51075     bool IsSEXT0 =
51076         (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
51077         (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
51078     bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
51079 
51080     if (IsSEXT0 && IsVZero1) {
51081       assert(VT == Op0.getOperand(0).getValueType() &&
51082              "Unexpected operand type");
51083       if (TmpCC == ISD::SETGT)
51084         return DAG.getConstant(0, DL, VT);
51085       if (TmpCC == ISD::SETLE)
51086         return DAG.getConstant(1, DL, VT);
51087       if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
51088         return DAG.getNOT(DL, Op0.getOperand(0), VT);
51089 
51090       assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
51091              "Unexpected condition code!");
51092       return Op0.getOperand(0);
51093     }
51094   }
51095 
51096   // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
51097   // pre-promote its result type since vXi1 vectors don't get promoted
51098   // during type legalization.
51099   // NOTE: The element count check is to ignore operand types that need to
51100   // go through type promotion to a 128-bit vector.
51101   if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
51102       VT.getVectorElementType() == MVT::i1 &&
51103       (OpVT.getVectorElementType() == MVT::i8 ||
51104        OpVT.getVectorElementType() == MVT::i16)) {
51105     SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
51106     return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
51107   }
51108 
51109   // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
51110   // to avoid scalarization via legalization because v4i32 is not a legal type.
51111   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
51112       LHS.getValueType() == MVT::v4f32)
51113     return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
51114 
51115   return SDValue();
51116 }
51117 
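// Combine X86ISD::MOVMSK nodes: constant fold, look through same-width
// bitcasts, and rewrite inverted or sign-compare inputs so the resulting
// mask folds better with scalar comparisons.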
51118 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
51119                              TargetLowering::DAGCombinerInfo &DCI,
51120                              const X86Subtarget &Subtarget) {
51121   SDValue Src = N->getOperand(0);
51122   MVT SrcVT = Src.getSimpleValueType();
51123   MVT VT = N->getSimpleValueType(0);
51124   unsigned NumBits = VT.getScalarSizeInBits();
51125   unsigned NumElts = SrcVT.getVectorNumElements();
51126 
51127   // Perform constant folding.
51128   if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
51129     assert(VT == MVT::i32 && "Unexpected result type");
51130     APInt Imm(32, 0);
51131     for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
51132       if (!Src.getOperand(Idx).isUndef() &&
51133           Src.getConstantOperandAPInt(Idx).isNegative())
51134         Imm.setBit(Idx);
51135     }
51136     return DAG.getConstant(Imm, SDLoc(N), VT);
51137   }
51138 
51139   // Look through int->fp bitcasts that don't change the element width.
51140   unsigned EltWidth = SrcVT.getScalarSizeInBits();
51141   if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
51142       Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
51143     return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
51144 
51145   // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
51146   // with scalar comparisons.
51147   if (SDValue NotSrc = IsNOT(Src, DAG)) {
51148     SDLoc DL(N);
51149     APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
51150     NotSrc = DAG.getBitcast(SrcVT, NotSrc);
51151     return DAG.getNode(ISD::XOR, DL, VT,
51152                        DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
51153                        DAG.getConstant(NotMask, DL, VT));
51154   }
51155 
51156   // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
51157   // results with scalar comparisons.
51158   if (Src.getOpcode() == X86ISD::PCMPGT &&
51159       ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
51160     SDLoc DL(N);
51161     APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
51162     return DAG.getNode(ISD::XOR, DL, VT,
51163                        DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
51164                        DAG.getConstant(NotMask, DL, VT));
51165   }
51166 
51167   // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
51168   // iff pow2splat(c1).
51169   if (Src.getOpcode() == X86ISD::PCMPEQ &&
51170       Src.getOperand(0).getOpcode() == ISD::AND &&
51171       ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
51172     SDValue LHS = Src.getOperand(0).getOperand(0);
51173     SDValue RHS = Src.getOperand(0).getOperand(1);
51174     KnownBits KnownRHS = DAG.computeKnownBits(RHS);
51175     if (KnownRHS.isConstant() && KnownRHS.getConstant().isPowerOf2()) {
51176       SDLoc DL(N);
51177       MVT ShiftVT = SrcVT;
51178       if (ShiftVT.getScalarType() == MVT::i8) {
51179         // vXi8 shifts - we only care about the signbit so can use PSLLW.
51180         ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
51181         LHS = DAG.getBitcast(ShiftVT, LHS);
51182       }
51183       unsigned ShiftAmt = KnownRHS.getConstant().countLeadingZeros();
51184       LHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT, LHS,
51185                                        ShiftAmt, DAG);
51186       LHS = DAG.getNOT(DL, DAG.getBitcast(SrcVT, LHS), SrcVT);
51187       return DAG.getNode(X86ISD::MOVMSK, DL, VT, LHS);
51188     }
51189   }
51190 
51191   // Simplify the inputs.
51192   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51193   APInt DemandedMask(APInt::getAllOnes(NumBits));
51194   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
51195     return SDValue(N, 0);
51196 
51197   return SDValue();
51198 }
51199 
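// Combine X86ISD::MGATHER/MSCATTER nodes: fold shifted indices into the
// scale operand and demand only the sign bit of vector masks.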
51200 static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
51201                                        TargetLowering::DAGCombinerInfo &DCI,
51202                                        const X86Subtarget &Subtarget) {
51203   auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
51204   SDValue BasePtr = MemOp->getBasePtr();
51205   SDValue Index = MemOp->getIndex();
51206   SDValue Scale = MemOp->getScale();
51207   SDValue Mask = MemOp->getMask();
51208 
51209   // Attempt to fold an index scale into the scale value directly.
51210   // For smaller indices, implicit sext is performed BEFORE scale, preventing
51211   // this fold under most circumstances.
51212   // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively?
51213   if ((Index.getOpcode() == X86ISD::VSHLI ||
51214        (Index.getOpcode() == ISD::ADD &&
51215         Index.getOperand(0) == Index.getOperand(1))) &&
51216       isa<ConstantSDNode>(Scale) &&
51217       BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) {
51218     unsigned ShiftAmt =
51219         Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1);
51220     uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
51221     uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
51222     if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) {
51223       SDValue NewIndex = Index.getOperand(0);
51224       SDValue NewScale =
51225           DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType());
51226       if (N->getOpcode() == X86ISD::MGATHER)
51227         return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG,
51228                                  MemOp->getOperand(1), Mask,
51229                                  MemOp->getBasePtr(), NewIndex, NewScale,
51230                                  MemOp->getChain(), Subtarget);
51231       if (N->getOpcode() == X86ISD::MSCATTER)
51232         return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG,
51233                               MemOp->getOperand(1), Mask, MemOp->getBasePtr(),
51234                               NewIndex, NewScale, MemOp->getChain(), Subtarget);
51235     }
51236   }
51237 
51238   // With vector masks we only demand the upper bit of the mask.
51239   if (Mask.getScalarValueSizeInBits() != 1) {
51240     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51241     APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
51242     if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
51243       if (N->getOpcode() != ISD::DELETED_NODE)
51244         DCI.AddToWorklist(N);
51245       return SDValue(N, 0);
51246     }
51247   }
51248 
51249   return SDValue();
51250 }
51251 
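// Helper to rebuild a masked gather/scatter node with new index, base and
// scale operands, preserving the remaining operands and memory operand.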
51252 static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
51253                                     SDValue Index, SDValue Base, SDValue Scale,
51254                                     SelectionDAG &DAG) {
51255   SDLoc DL(GorS);
51256 
51257   if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
51258     SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
51259                       Gather->getMask(), Base, Index, Scale } ;
51260     return DAG.getMaskedGather(Gather->getVTList(),
51261                                Gather->getMemoryVT(), DL, Ops,
51262                                Gather->getMemOperand(),
51263                                Gather->getIndexType(),
51264                                Gather->getExtensionType());
51265   }
51266   auto *Scatter = cast<MaskedScatterSDNode>(GorS);
51267   SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
51268                     Scatter->getMask(), Base, Index, Scale };
51269   return DAG.getMaskedScatter(Scatter->getVTList(),
51270                               Scatter->getMemoryVT(), DL,
51271                               Ops, Scatter->getMemOperand(),
51272                               Scatter->getIndexType(),
51273                               Scatter->isTruncatingStore());
51274 }
51275 
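// Combine ISD::MGATHER/MSCATTER nodes: shrink oversized constant or extended
// indices, move splat constant adders from the index into the base pointer,
// legalize the index width, and demand only the sign bit of vector masks.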
51276 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
51277                                     TargetLowering::DAGCombinerInfo &DCI) {
51278   SDLoc DL(N);
51279   auto *GorS = cast<MaskedGatherScatterSDNode>(N);
51280   SDValue Index = GorS->getIndex();
51281   SDValue Base = GorS->getBasePtr();
51282   SDValue Scale = GorS->getScale();
51283 
51284   if (DCI.isBeforeLegalize()) {
51285     unsigned IndexWidth = Index.getScalarValueSizeInBits();
51286 
51287     // Shrink constant indices if they are larger than 32-bits.
51288     // Only do this before legalize types since v2i64 could become v2i32.
51289     // FIXME: We could check that the type is legal if we're after legalize
51290     // types, but then we would need to construct test cases where that happens.
    // FIXME: We could support more than just constant vectors, but we need to
    // be careful with costing. A truncate that can be optimized out would be
    // fine.
51293     // Otherwise we might only want to create a truncate if it avoids a split.
51294     if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
51295       if (BV->isConstant() && IndexWidth > 32 &&
51296           DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
51297         EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
51298         Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
51299         return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51300       }
51301     }
51302 
51303     // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
51304     // there are sufficient sign bits. Only do this before legalize types to
51305     // avoid creating illegal types in truncate.
51306     if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
51307          Index.getOpcode() == ISD::ZERO_EXTEND) &&
51308         IndexWidth > 32 &&
51309         Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
51310         DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
51311       EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
51312       Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
51313       return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51314     }
51315   }
51316 
51317   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51318   EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
  // Try to move splat constant adders from the index operand to the base
  // pointer operand, taking care to multiply by the scale. We can only do
  // this when the index element type is the same as the pointer type.
51322   // Otherwise we need to be sure the math doesn't wrap before the scale.
51323   if (Index.getOpcode() == ISD::ADD &&
51324       Index.getValueType().getVectorElementType() == PtrVT &&
51325       isa<ConstantSDNode>(Scale)) {
51326     uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
51327     if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
51328       BitVector UndefElts;
51329       if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
51330         // FIXME: Allow non-constant?
51331         if (UndefElts.none()) {
51332           // Apply the scale.
51333           APInt Adder = C->getAPIntValue() * ScaleAmt;
51334           // Add it to the existing base.
51335           Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
51336                              DAG.getConstant(Adder, DL, PtrVT));
51337           Index = Index.getOperand(0);
51338           return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51339         }
51340       }
51341 
51342       // It's also possible base is just a constant. In that case, just
51343       // replace it with 0 and move the displacement into the index.
51344       if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
51345           isOneConstant(Scale)) {
51346         SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
51347         // Combine the constant build_vector and the constant base.
51348         Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
51349                             Index.getOperand(1), Splat);
51350         // Add to the LHS of the original Index add.
51351         Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
51352                             Index.getOperand(0), Splat);
51353         Base = DAG.getConstant(0, DL, Base.getValueType());
51354         return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51355       }
51356     }
51357   }
51358 
51359   if (DCI.isBeforeLegalizeOps()) {
51360     unsigned IndexWidth = Index.getScalarValueSizeInBits();
51361 
    // Make sure the index is either i32 or i64.
51363     if (IndexWidth != 32 && IndexWidth != 64) {
51364       MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
51365       EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
51366       Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
51367       return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
51368     }
51369   }
51370 
51371   // With vector masks we only demand the upper bit of the mask.
51372   SDValue Mask = GorS->getMask();
51373   if (Mask.getScalarValueSizeInBits() != 1) {
51374     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51375     APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
51376     if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
51377       if (N->getOpcode() != ISD::DELETED_NODE)
51378         DCI.AddToWorklist(N);
51379       return SDValue(N, 0);
51380     }
51381   }
51382 
51383   return SDValue();
51384 }
51385 
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT.
51387 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
51388                                const X86Subtarget &Subtarget) {
51389   SDLoc DL(N);
51390   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
51391   SDValue EFLAGS = N->getOperand(1);
51392 
51393   // Try to simplify the EFLAGS and condition code operands.
51394   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
51395     return getSETCC(CC, Flags, DL, DAG);
51396 
51397   return SDValue();
51398 }
51399 
51400 /// Optimize branch condition evaluation.
51401 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
51402                              const X86Subtarget &Subtarget) {
51403   SDLoc DL(N);
51404   SDValue EFLAGS = N->getOperand(3);
51405   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
51406 
51407   // Try to simplify the EFLAGS and condition code operands.
51408   // Make sure to not keep references to operands, as combineSetCCEFLAGS can
51409   // RAUW them under us.
51410   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
51411     SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
51412     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
51413                        N->getOperand(1), Cond, Flags);
51414   }
51415 
51416   return SDValue();
51417 }
51418 
51419 // TODO: Could we move this to DAGCombine?
51420 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
51421                                                   SelectionDAG &DAG) {
  // Take advantage of vector comparisons (etc.) producing 0 or -1 in each
  // lane to optimize away the operation when its input comes from a constant.
51424   //
51425   // The general transformation is:
51426   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
51427   //       AND(VECTOR_CMP(x,y), constant2)
51428   //    constant2 = UNARYOP(constant)
51429 
51430   // Early exit if this isn't a vector operation, the operand of the
51431   // unary operation isn't a bitwise AND, or if the sizes of the operations
51432   // aren't the same.
51433   EVT VT = N->getValueType(0);
51434   bool IsStrict = N->isStrictFPOpcode();
51435   unsigned NumEltBits = VT.getScalarSizeInBits();
51436   SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
51437   if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
51438       DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
51439       VT.getSizeInBits() != Op0.getValueSizeInBits())
51440     return SDValue();
51441 
51442   // Now check that the other operand of the AND is a constant. We could
51443   // make the transformation for non-constant splats as well, but it's unclear
51444   // that would be a benefit as it would not eliminate any operations, just
51445   // perform one more step in scalar code before moving to the vector unit.
51446   if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
51447     // Bail out if the vector isn't a constant.
51448     if (!BV->isConstant())
51449       return SDValue();
51450 
51451     // Everything checks out. Build up the new and improved node.
51452     SDLoc DL(N);
51453     EVT IntVT = BV->getValueType(0);
51454     // Create a new constant of the appropriate type for the transformed
51455     // DAG.
51456     SDValue SourceConst;
51457     if (IsStrict)
51458       SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
51459                                 {N->getOperand(0), SDValue(BV, 0)});
51460     else
51461       SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
51462     // The AND node needs bitcasts to/from an integer vector type around it.
51463     SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
51464     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
51465                                  MaskConst);
51466     SDValue Res = DAG.getBitcast(VT, NewAnd);
51467     if (IsStrict)
51468       return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
51469     return Res;
51470   }
51471 
51472   return SDValue();
51473 }
51474 
51475 /// If we are converting a value to floating-point, try to replace scalar
51476 /// truncate of an extracted vector element with a bitcast. This tries to keep
51477 /// the sequence on XMM registers rather than moving between vector and GPRs.
51478 static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
51479   // TODO: This is currently only used by combineSIntToFP, but it is generalized
51480   //       to allow being called by any similar cast opcode.
51481   // TODO: Consider merging this into lowering: vectorizeExtractedCast().
51482   SDValue Trunc = N->getOperand(0);
51483   if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
51484     return SDValue();
51485 
51486   SDValue ExtElt = Trunc.getOperand(0);
51487   if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51488       !isNullConstant(ExtElt.getOperand(1)))
51489     return SDValue();
51490 
51491   EVT TruncVT = Trunc.getValueType();
51492   EVT SrcVT = ExtElt.getValueType();
51493   unsigned DestWidth = TruncVT.getSizeInBits();
51494   unsigned SrcWidth = SrcVT.getSizeInBits();
51495   if (SrcWidth % DestWidth != 0)
51496     return SDValue();
51497 
51498   // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
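  // e.g. if X is v2i64 and the truncation is to i32, we bitcast X to v4i32 and
  // extract element 0 instead, keeping the value in an XMM register.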
51499   EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
51500   unsigned VecWidth = SrcVecVT.getSizeInBits();
51501   unsigned NumElts = VecWidth / DestWidth;
51502   EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
51503   SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
51504   SDLoc DL(N);
51505   SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
51506                                   BitcastVec, ExtElt.getOperand(1));
51507   return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
51508 }
51509 
51510 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
51511                                const X86Subtarget &Subtarget) {
51512   bool IsStrict = N->isStrictFPOpcode();
51513   SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
51514   EVT VT = N->getValueType(0);
51515   EVT InVT = Op0.getValueType();
51516 
51517   // UINT_TO_FP(vXi1~15)  -> UINT_TO_FP(ZEXT(vXi1~15  to vXi16))
51518   // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
51519   // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
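  // e.g. UINT_TO_FP(v8i8) is rewritten as UINT_TO_FP(ZERO_EXTEND(v8i8 to v8i16)),
  // so the conversion to v8f16 operates on a supported integer element width.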
51520   if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
51521     unsigned ScalarSize = InVT.getScalarSizeInBits();
51522     if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
51523       return SDValue();
51524     SDLoc dl(N);
51525     EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
51526                                  ScalarSize < 16   ? MVT::i16
51527                                  : ScalarSize < 32 ? MVT::i32
51528                                                    : MVT::i64,
51529                                  InVT.getVectorNumElements());
51530     SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
51531     if (IsStrict)
51532       return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
51533                          {N->getOperand(0), P});
51534     return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
51535   }
51536 
51537   // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
51538   // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
51539   // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
51540   if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
51541       VT.getScalarType() != MVT::f16) {
51542     SDLoc dl(N);
51543     EVT DstVT = InVT.changeVectorElementType(MVT::i32);
51544     SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
51545 
51546     // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
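    // This is safe: after the zero-extension the value fits in at most 31 bits,
    // so the i32 sign bit is clear and signed conversion gives the same result.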
51547     if (IsStrict)
51548       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
51549                          {N->getOperand(0), P});
51550     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
51551   }
51552 
51553   // Since UINT_TO_FP is legal (it's marked custom), the generic DAG combiner
51554   // won't optimize it to a SINT_TO_FP when the sign bit is known zero, so
51555   // perform that optimization here.
51556   if (DAG.SignBitIsZero(Op0)) {
51557     if (IsStrict)
51558       return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
51559                          {N->getOperand(0), Op0});
51560     return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
51561   }
51562 
51563   return SDValue();
51564 }
51565 
51566 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
51567                                TargetLowering::DAGCombinerInfo &DCI,
51568                                const X86Subtarget &Subtarget) {
51569   // First try to optimize away the conversion entirely when its input is a
51570   // constant selected by a vector compare mask. Vectors only.
51571   bool IsStrict = N->isStrictFPOpcode();
51572   if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
51573     return Res;
51574 
51575   // Now move on to more general possibilities.
51576   SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
51577   EVT VT = N->getValueType(0);
51578   EVT InVT = Op0.getValueType();
51579 
51580   // SINT_TO_FP(vXi1~15)  -> SINT_TO_FP(SEXT(vXi1~15  to vXi16))
51581   // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
51582   // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
51583   if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
51584     unsigned ScalarSize = InVT.getScalarSizeInBits();
51585     if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
51586       return SDValue();
51587     SDLoc dl(N);
51588     EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
51589                                  ScalarSize < 16   ? MVT::i16
51590                                  : ScalarSize < 32 ? MVT::i32
51591                                                    : MVT::i64,
51592                                  InVT.getVectorNumElements());
51593     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
51594     if (IsStrict)
51595       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
51596                          {N->getOperand(0), P});
51597     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
51598   }
51599 
51600   // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
51601   // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
51602   // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
51603   if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
51604       VT.getScalarType() != MVT::f16) {
51605     SDLoc dl(N);
51606     EVT DstVT = InVT.changeVectorElementType(MVT::i32);
51607     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
51608     if (IsStrict)
51609       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
51610                          {N->getOperand(0), P});
51611     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
51612   }
51613 
51614   // Without AVX512DQ we only support i64 to float scalar conversion. For both
51615   // vectors and scalars, see if we know that the upper bits are all the sign
51616   // bit, in which case we can truncate the input to i32 and convert from that.
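  // For example, an i64 value that is a sign-extended i32 has at least 33 sign
  // bits, so it can be truncated to i32 and converted from there.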
51617   if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
51618     unsigned BitWidth = InVT.getScalarSizeInBits();
51619     unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
51620     if (NumSignBits >= (BitWidth - 31)) {
51621       EVT TruncVT = MVT::i32;
51622       if (InVT.isVector())
51623         TruncVT = InVT.changeVectorElementType(TruncVT);
51624       SDLoc dl(N);
51625       if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
51626         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
51627         if (IsStrict)
51628           return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
51629                              {N->getOperand(0), Trunc});
51630         return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
51631       }
51632       // If we're after legalize and the type is v2i32 we need to shuffle and
51633       // use CVTSI2P.
51634       assert(InVT == MVT::v2i64 && "Unexpected VT!");
51635       SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
51636       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
51637                                           { 0, 2, -1, -1 });
51638       if (IsStrict)
51639         return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
51640                            {N->getOperand(0), Shuf});
51641       return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
51642     }
51643   }
51644 
51645   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
51646   // a 32-bit target where SSE doesn't support i64->FP operations.
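  // Note: this only fires for a simple (non-volatile, non-atomic) single-use
  // scalar i64 load on a 32-bit target, which BuildFILD converts directly from
  // the 64-bit memory operand via x87.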
51647   if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
51648       Op0.getOpcode() == ISD::LOAD) {
51649     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
51650 
51651     // This transformation is not supported if the result type is f16 or f128.
51652     if (VT == MVT::f16 || VT == MVT::f128)
51653       return SDValue();
51654 
51655     // If we have AVX512DQ we can use packed conversion instructions unless
51656     // the VT is f80.
51657     if (Subtarget.hasDQI() && VT != MVT::f80)
51658       return SDValue();
51659 
51660     if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
51661         Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
51662       std::pair<SDValue, SDValue> Tmp =
51663           Subtarget.getTargetLowering()->BuildFILD(
51664               VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
51665               Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
51666       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
51667       return Tmp.first;
51668     }
51669   }
51670 
51671   if (IsStrict)
51672     return SDValue();
51673 
51674   if (SDValue V = combineToFPTruncExtElt(N, DAG))
51675     return V;
51676 
51677   return SDValue();
51678 }
51679 
51680 static bool needCarryOrOverflowFlag(SDValue Flags) {
51681   assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
51682 
51683   for (const SDNode *User : Flags->uses()) {
51684     X86::CondCode CC;
51685     switch (User->getOpcode()) {
51686     default:
51687       // Be conservative.
51688       return true;
51689     case X86ISD::SETCC:
51690     case X86ISD::SETCC_CARRY:
51691       CC = (X86::CondCode)User->getConstantOperandVal(0);
51692       break;
51693     case X86ISD::BRCOND:
51694       CC = (X86::CondCode)User->getConstantOperandVal(2);
51695       break;
51696     case X86ISD::CMOV:
51697       CC = (X86::CondCode)User->getConstantOperandVal(2);
51698       break;
51699     }
51700 
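    // A/AE/B/BE depend on CF, O/NO on OF, and G/GE/L/LE on SF and OF, so all of
    // these condition codes require the carry or overflow flag to be preserved.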
51701     switch (CC) {
51702     default: break;
51703     case X86::COND_A: case X86::COND_AE:
51704     case X86::COND_B: case X86::COND_BE:
51705     case X86::COND_O: case X86::COND_NO:
51706     case X86::COND_G: case X86::COND_GE:
51707     case X86::COND_L: case X86::COND_LE:
51708       return true;
51709     }
51710   }
51711 
51712   return false;
51713 }
51714 
51715 static bool onlyZeroFlagUsed(SDValue Flags) {
51716   assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
51717 
51718   for (const SDNode *User : Flags->uses()) {
51719     unsigned CCOpNo;
51720     switch (User->getOpcode()) {
51721     default:
51722       // Be conservative.
51723       return false;
51724     case X86ISD::SETCC:       CCOpNo = 0; break;
51725     case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
51726     case X86ISD::BRCOND:      CCOpNo = 2; break;
51727     case X86ISD::CMOV:        CCOpNo = 2; break;
51728     }
51729 
51730     X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
51731     if (CC != X86::COND_E && CC != X86::COND_NE)
51732       return false;
51733   }
51734 
51735   return true;
51736 }
51737 
51738 static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
51739   // Only handle test patterns.
51740   if (!isNullConstant(N->getOperand(1)))
51741     return SDValue();
51742 
51743   // If we have a CMP of a truncated binop, see if we can make a smaller binop
51744   // and use its flags directly.
51745   // TODO: Maybe we should try promoting compares that only use the zero flag
51746   // first if we can prove the upper bits with computeKnownBits?
51747   SDLoc dl(N);
51748   SDValue Op = N->getOperand(0);
51749   EVT VT = Op.getValueType();
51750 
51751   // If we have a constant logical shift that's only used in a comparison
51752   // against zero, turn it into an equivalent AND. This allows turning it into
51753   // a TEST instruction later.
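  // e.g. for an i32 value X, (cmp (srl X, 8), 0) becomes
  // (cmp (and X, 0xFFFFFF00), 0), which isel can select as a single TEST.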
51754   if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
51755       Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
51756       onlyZeroFlagUsed(SDValue(N, 0))) {
51757     unsigned BitWidth = VT.getSizeInBits();
51758     const APInt &ShAmt = Op.getConstantOperandAPInt(1);
51759     if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
51760       unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
51761       APInt Mask = Op.getOpcode() == ISD::SRL
51762                        ? APInt::getHighBitsSet(BitWidth, MaskBits)
51763                        : APInt::getLowBitsSet(BitWidth, MaskBits);
51764       if (Mask.isSignedIntN(32)) {
51765         Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
51766                          DAG.getConstant(Mask, dl, VT));
51767         return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
51768                            DAG.getConstant(0, dl, VT));
51769       }
51770     }
51771   }
51772 
51773   // Look for a truncate.
51774   if (Op.getOpcode() != ISD::TRUNCATE)
51775     return SDValue();
51776 
51777   SDValue Trunc = Op;
51778   Op = Op.getOperand(0);
51779 
51780   // See if we can compare with zero against the truncation source, which
51781   // should help using the Z flag from many ops. Only do this when the
51782   // truncated op is i32, to prevent partial-reg compares of promoted ops.
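  // e.g. (cmp (trunc i32 X to i8), 0) becomes (cmp X, 0) when the upper 24 bits
  // of X are known zero and only the zero flag is consumed.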
51783   EVT OpVT = Op.getValueType();
51784   APInt UpperBits =
51785       APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
51786   if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
51787       onlyZeroFlagUsed(SDValue(N, 0))) {
51788     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
51789                        DAG.getConstant(0, dl, OpVT));
51790   }
51791 
51792   // After this the truncate and arithmetic op must have a single use.
51793   if (!Trunc.hasOneUse() || !Op.hasOneUse())
51794     return SDValue();
51795 
51796   unsigned NewOpc;
51797   switch (Op.getOpcode()) {
51798   default: return SDValue();
51799   case ISD::AND:
51800     // Skip AND with a constant. We have special handling for AND with an
51801     // immediate during isel to generate TEST instructions.
51802     if (isa<ConstantSDNode>(Op.getOperand(1)))
51803       return SDValue();
51804     NewOpc = X86ISD::AND;
51805     break;
51806   case ISD::OR:  NewOpc = X86ISD::OR;  break;
51807   case ISD::XOR: NewOpc = X86ISD::XOR; break;
51808   case ISD::ADD:
51809     // If the carry or overflow flag is used, we can't truncate.
51810     if (needCarryOrOverflowFlag(SDValue(N, 0)))
51811       return SDValue();
51812     NewOpc = X86ISD::ADD;
51813     break;
51814   case ISD::SUB:
51815     // If the carry or overflow flag is used, we can't truncate.
51816     if (needCarryOrOverflowFlag(SDValue(N, 0)))
51817       return SDValue();
51818     NewOpc = X86ISD::SUB;
51819     break;
51820   }
51821 
51822   // We found an op we can narrow. Truncate its inputs.
51823   SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
51824   SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
51825 
51826   // Use an X86-specific opcode to avoid DAG combine messing with it.
51827   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
51828   Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
51829 
51830   // For AND, keep a CMP so that we can match the test pattern.
51831   if (NewOpc == X86ISD::AND)
51832     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
51833                        DAG.getConstant(0, dl, VT));
51834 
51835   // Return the flags.
51836   return Op.getValue(1);
51837 }
51838 
51839 static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
51840                                 TargetLowering::DAGCombinerInfo &DCI) {
51841   assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
51842          "Expected X86ISD::ADD or X86ISD::SUB");
51843 
51844   SDLoc DL(N);
51845   SDValue LHS = N->getOperand(0);
51846   SDValue RHS = N->getOperand(1);
51847   MVT VT = LHS.getSimpleValueType();
51848   unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
51849 
51850   // If we don't use the flag result, simplify back to a generic ADD/SUB.
51851   if (!N->hasAnyUseOfValue(1)) {
51852     SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
51853     return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
51854   }
51855 
51856   // Fold any similar generic ADD/SUB opcodes to reuse this node.
51857   auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
51858     SDValue Ops[] = {N0, N1};
51859     SDVTList VTs = DAG.getVTList(N->getValueType(0));
51860     if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
51861       SDValue Op(N, 0);
51862       if (Negate)
51863         Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
51864       DCI.CombineTo(GenericAddSub, Op);
51865     }
51866   };
51867   MatchGeneric(LHS, RHS, false);
51868   MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
51869 
51870   return SDValue();
51871 }
51872 
51873 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
51874   if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
51875     MVT VT = N->getSimpleValueType(0);
51876     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
51877     return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
51878                        N->getOperand(0), N->getOperand(1),
51879                        Flags);
51880   }
51881 
51882   // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
51883   // iff the flag result is dead.
51884   SDValue Op0 = N->getOperand(0);
51885   SDValue Op1 = N->getOperand(1);
51886   if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
51887       !N->hasAnyUseOfValue(1))
51888     return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
51889                        Op0.getOperand(1), N->getOperand(2));
51890 
51891   return SDValue();
51892 }
51893 
51894 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
51895 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
51896                           TargetLowering::DAGCombinerInfo &DCI) {
51897   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
51898   // the result is either zero or one (depending on the input carry bit).
51899   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
51900   if (X86::isZeroNode(N->getOperand(0)) &&
51901       X86::isZeroNode(N->getOperand(1)) &&
51902       // We don't have a good way to replace an EFLAGS use, so only do this when
51903       // dead right now.
51904       SDValue(N, 1).use_empty()) {
51905     SDLoc DL(N);
51906     EVT VT = N->getValueType(0);
51907     SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
51908     SDValue Res1 =
51909         DAG.getNode(ISD::AND, DL, VT,
51910                     DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
51911                                 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
51912                                 N->getOperand(2)),
51913                     DAG.getConstant(1, DL, VT));
51914     return DCI.CombineTo(N, Res1, CarryOut);
51915   }
51916 
51917   if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
51918     MVT VT = N->getSimpleValueType(0);
51919     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
51920     return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
51921                        N->getOperand(0), N->getOperand(1),
51922                        Flags);
51923   }
51924 
51925   return SDValue();
51926 }
51927 
51928 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
51929 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
51930 /// with CMP+{ADC, SBB}.
51931 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
51932   bool IsSub = N->getOpcode() == ISD::SUB;
51933   SDValue X = N->getOperand(0);
51934   SDValue Y = N->getOperand(1);
51935 
51936   // If this is an add, canonicalize a zext operand to the RHS.
51937   // TODO: Incomplete? What if both sides are zexts?
51938   if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
51939       Y.getOpcode() != ISD::ZERO_EXTEND)
51940     std::swap(X, Y);
51941 
51942   // Look through a one-use zext.
51943   bool PeekedThroughZext = false;
51944   if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
51945     Y = Y.getOperand(0);
51946     PeekedThroughZext = true;
51947   }
51948 
51949   // If this is an add, canonicalize a setcc operand to the RHS.
51950   // TODO: Incomplete? What if both sides are setcc?
51951   // TODO: Should we allow peeking through a zext of the other operand?
51952   if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
51953       Y.getOpcode() != X86ISD::SETCC)
51954     std::swap(X, Y);
51955 
51956   if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
51957     return SDValue();
51958 
51959   SDLoc DL(N);
51960   EVT VT = N->getValueType(0);
51961   X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
51962 
51963   // If X is -1 or 0, then we have an opportunity to avoid constants required in
51964   // the general case below.
51965   auto *ConstantX = dyn_cast<ConstantSDNode>(X);
51966   if (ConstantX) {
51967     if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
51968         (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
51969       // This is a complicated way to get -1 or 0 from the carry flag:
51970       // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
51971       //  0 - SETB  -->  0 -  (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
51972       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
51973                          DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
51974                          Y.getOperand(1));
51975     }
51976 
51977     if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
51978         (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
51979       SDValue EFLAGS = Y->getOperand(1);
51980       if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
51981           EFLAGS.getValueType().isInteger() &&
51982           !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
51983         // Swap the operands of a SUB, and we have the same pattern as above.
51984         // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
51985         //  0 - SETA  (SUB A, B) -->  0 - SETB  (SUB B, A) --> SUB + SBB
51986         SDValue NewSub = DAG.getNode(
51987             X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
51988             EFLAGS.getOperand(1), EFLAGS.getOperand(0));
51989         SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
51990         return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
51991                            DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
51992                            NewEFLAGS);
51993       }
51994     }
51995   }
51996 
51997   if (CC == X86::COND_B) {
51998     // X + SETB Z --> adc X, 0
51999     // X - SETB Z --> sbb X, 0
52000     return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52001                        DAG.getVTList(VT, MVT::i32), X,
52002                        DAG.getConstant(0, DL, VT), Y.getOperand(1));
52003   }
52004 
52005   if (CC == X86::COND_A) {
52006     SDValue EFLAGS = Y.getOperand(1);
52007     // Try to convert COND_A into COND_B in an attempt to facilitate
52008     // materializing "setb reg".
52009     //
52010     // Do not flip "e > c", where "c" is a constant, because the CMP instruction
52011     // cannot take an immediate as its first operand.
52012     //
52013     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52014         EFLAGS.getValueType().isInteger() &&
52015         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52016       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
52017                                    EFLAGS.getNode()->getVTList(),
52018                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52019       SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52020       return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52021                          DAG.getVTList(VT, MVT::i32), X,
52022                          DAG.getConstant(0, DL, VT), NewEFLAGS);
52023     }
52024   }
52025 
52026   if (CC == X86::COND_AE) {
52027     // X + SETAE --> sbb X, -1
52028     // X - SETAE --> adc X, -1
52029     return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52030                        DAG.getVTList(VT, MVT::i32), X,
52031                        DAG.getConstant(-1, DL, VT), Y.getOperand(1));
52032   }
52033 
52034   if (CC == X86::COND_BE) {
52035     // X + SETBE --> sbb X, -1
52036     // X - SETBE --> adc X, -1
52037     SDValue EFLAGS = Y.getOperand(1);
52038     // Try to convert COND_BE into COND_AE in an attempt to facilitate
52039     // materializing "setae reg".
52040     //
52041     // Do not flip "e <= c", where "c" is a constant, because the CMP instruction
52042     // cannot take an immediate as its first operand.
52043     //
52044     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52045         EFLAGS.getValueType().isInteger() &&
52046         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52047       SDValue NewSub = DAG.getNode(
52048           X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52049           EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52050       SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52051       return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52052                          DAG.getVTList(VT, MVT::i32), X,
52053                          DAG.getConstant(-1, DL, VT), NewEFLAGS);
52054     }
52055   }
52056 
52057   if (CC != X86::COND_E && CC != X86::COND_NE)
52058     return SDValue();
52059 
52060   SDValue Cmp = Y.getOperand(1);
52061   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
52062       !X86::isZeroNode(Cmp.getOperand(1)) ||
52063       !Cmp.getOperand(0).getValueType().isInteger())
52064     return SDValue();
52065 
52066   SDValue Z = Cmp.getOperand(0);
52067   EVT ZVT = Z.getValueType();
52068 
52069   // If X is -1 or 0, then we have an opportunity to avoid constants required in
52070   // the general case below.
52071   if (ConstantX) {
52072     // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52073     // fake operands:
52074     //  0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52075     // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52076     if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52077         (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52078       SDValue Zero = DAG.getConstant(0, DL, ZVT);
52079       SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52080       SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52081       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52082                          DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52083                          SDValue(Neg.getNode(), 1));
52084     }
52085 
52086     // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52087     // with fake operands:
52088     //  0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52089     // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52090     if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52091         (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52092       SDValue One = DAG.getConstant(1, DL, ZVT);
52093       SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52094       SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52095       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52096                          DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52097                          Cmp1.getValue(1));
52098     }
52099   }
52100 
52101   // (cmp Z, 1) sets the carry flag if Z is 0.
52102   SDValue One = DAG.getConstant(1, DL, ZVT);
52103   SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52104   SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52105 
52106   // Add the flags type for ADC/SBB nodes.
52107   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52108 
52109   // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52110   // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52111   if (CC == X86::COND_NE)
52112     return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52113                        DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
52114 
52115   // X - (Z == 0) --> sub X, (zext(sete  Z, 0)) --> sbb X, 0, (cmp Z, 1)
52116   // X + (Z == 0) --> add X, (zext(sete  Z, 0)) --> adc X, 0, (cmp Z, 1)
52117   return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52118                      DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52119 }
52120 
52121 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
52122                             const SDLoc &DL, EVT VT,
52123                             const X86Subtarget &Subtarget) {
52124   // Example of pattern we try to detect:
52125   // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
52126   //(add (build_vector (extract_elt t, 0),
52127   //                   (extract_elt t, 2),
52128   //                   (extract_elt t, 4),
52129   //                   (extract_elt t, 6)),
52130   //     (build_vector (extract_elt t, 1),
52131   //                   (extract_elt t, 3),
52132   //                   (extract_elt t, 5),
52133   //                   (extract_elt t, 7)))
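  // This matches VPMADDWD's semantics: it multiplies adjacent pairs of signed
  // i16 elements and sums each pair into one i32 lane, so the instruction's
  // built-in pairwise add performs the odd/even addition for us.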
52134 
52135   if (!Subtarget.hasSSE2())
52136     return SDValue();
52137 
52138   if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
52139       Op1.getOpcode() != ISD::BUILD_VECTOR)
52140     return SDValue();
52141 
52142   if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
52143       VT.getVectorNumElements() < 4 ||
52144       !isPowerOf2_32(VT.getVectorNumElements()))
52145     return SDValue();
52146 
52147   // Check if one of Op0,Op1 is of the form:
52148   // (build_vector (extract_elt Mul, 0),
52149   //               (extract_elt Mul, 2),
52150   //               (extract_elt Mul, 4),
52151   //                   ...
52152   // the other is of the form:
52153   // (build_vector (extract_elt Mul, 1),
52154   //               (extract_elt Mul, 3),
52155   //               (extract_elt Mul, 5),
52156   //                   ...
52157   // and identify Mul.
52158   SDValue Mul;
52159   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
52160     SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
52161             Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
52162     // TODO: Be more tolerant to undefs.
52163     if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52164         Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52165         Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52166         Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
52167       return SDValue();
52168     auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
52169     auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
52170     auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
52171     auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
52172     if (!Const0L || !Const1L || !Const0H || !Const1H)
52173       return SDValue();
52174     unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
52175              Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
52176     // Commutativity of mul allows factors of a product to reorder.
52177     if (Idx0L > Idx1L)
52178       std::swap(Idx0L, Idx1L);
52179     if (Idx0H > Idx1H)
52180       std::swap(Idx0H, Idx1H);
52181     // Commutativity of add allows pairs of factors to reorder.
52182     if (Idx0L > Idx0H) {
52183       std::swap(Idx0L, Idx0H);
52184       std::swap(Idx1L, Idx1H);
52185     }
52186     if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
52187         Idx1H != 2 * i + 3)
52188       return SDValue();
52189     if (!Mul) {
52190       // First time an extract_elt's source vector is visited. Must be a MUL
52191       // with twice as many vector elements as the BUILD_VECTOR.
52192       // Both extracts must be from the same MUL.
52193       Mul = Op0L->getOperand(0);
52194       if (Mul->getOpcode() != ISD::MUL ||
52195           Mul.getValueType().getVectorNumElements() != 2 * e)
52196         return SDValue();
52197     }
52198     // Check that the extract is from the same MUL previously seen.
52199     if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
52200         Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
52201       return SDValue();
52202   }
52203 
52204   // Check if the Mul source can be safely shrunk.
52205   ShrinkMode Mode;
52206   if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
52207       Mode == ShrinkMode::MULU16)
52208     return SDValue();
52209 
52210   EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52211                                  VT.getVectorNumElements() * 2);
52212   SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
52213   SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
52214 
52215   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
52216                          ArrayRef<SDValue> Ops) {
52217     EVT InVT = Ops[0].getValueType();
52218     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
52219     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
52220                                  InVT.getVectorNumElements() / 2);
52221     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
52222   };
52223   return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
52224 }
52225 
52226 // Attempt to turn this pattern into PMADDWD.
52227 // (add (mul (sext (build_vector)), (sext (build_vector))),
52228 //      (mul (sext (build_vector)), (sext (build_vector)))
52229 static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
52230                               const SDLoc &DL, EVT VT,
52231                               const X86Subtarget &Subtarget) {
52232   if (!Subtarget.hasSSE2())
52233     return SDValue();
52234 
52235   if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
52236     return SDValue();
52237 
52238   if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
52239       VT.getVectorNumElements() < 4 ||
52240       !isPowerOf2_32(VT.getVectorNumElements()))
52241     return SDValue();
52242 
52243   SDValue N00 = N0.getOperand(0);
52244   SDValue N01 = N0.getOperand(1);
52245   SDValue N10 = N1.getOperand(0);
52246   SDValue N11 = N1.getOperand(1);
52247 
52248   // All inputs need to be sign extends.
52249   // TODO: Support ZERO_EXTEND from known positive?
52250   if (N00.getOpcode() != ISD::SIGN_EXTEND ||
52251       N01.getOpcode() != ISD::SIGN_EXTEND ||
52252       N10.getOpcode() != ISD::SIGN_EXTEND ||
52253       N11.getOpcode() != ISD::SIGN_EXTEND)
52254     return SDValue();
52255 
52256   // Peek through the extends.
52257   N00 = N00.getOperand(0);
52258   N01 = N01.getOperand(0);
52259   N10 = N10.getOperand(0);
52260   N11 = N11.getOperand(0);
52261 
52262   // Must be extending from vXi16.
52263   EVT InVT = N00.getValueType();
52264   if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
52265       N10.getValueType() != InVT || N11.getValueType() != InVT)
52266     return SDValue();
52267 
52268   // All inputs should be build_vectors.
52269   if (N00.getOpcode() != ISD::BUILD_VECTOR ||
52270       N01.getOpcode() != ISD::BUILD_VECTOR ||
52271       N10.getOpcode() != ISD::BUILD_VECTOR ||
52272       N11.getOpcode() != ISD::BUILD_VECTOR)
52273     return SDValue();
52274 
52275   // For each output element, we need an odd element from one vector multiplied
52276   // by the odd element of another vector, plus the even element from one of
52277   // the same vectors multiplied by the even element from the other vector.
52278   // In other words, for each element i, the following computation must be
52279   // performed:
52280   //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
52281   SDValue In0, In1;
52282   for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
52283     SDValue N00Elt = N00.getOperand(i);
52284     SDValue N01Elt = N01.getOperand(i);
52285     SDValue N10Elt = N10.getOperand(i);
52286     SDValue N11Elt = N11.getOperand(i);
52287     // TODO: Be more tolerant to undefs.
52288     if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52289         N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52290         N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52291         N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
52292       return SDValue();
52293     auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
52294     auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
52295     auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
52296     auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
52297     if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
52298       return SDValue();
52299     unsigned IdxN00 = ConstN00Elt->getZExtValue();
52300     unsigned IdxN01 = ConstN01Elt->getZExtValue();
52301     unsigned IdxN10 = ConstN10Elt->getZExtValue();
52302     unsigned IdxN11 = ConstN11Elt->getZExtValue();
52303     // Add is commutative so indices can be reordered.
52304     if (IdxN00 > IdxN10) {
52305       std::swap(IdxN00, IdxN10);
52306       std::swap(IdxN01, IdxN11);
52307     }
52308     // N0 indices must be the even element. N1 indices must be the next odd element.
52309     if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
52310         IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
52311       return SDValue();
52312     SDValue N00In = N00Elt.getOperand(0);
52313     SDValue N01In = N01Elt.getOperand(0);
52314     SDValue N10In = N10Elt.getOperand(0);
52315     SDValue N11In = N11Elt.getOperand(0);
52316 
52317     // First time we find an input capture it.
52318     if (!In0) {
52319       In0 = N00In;
52320       In1 = N01In;
52321 
52322       // The input vectors must be at least as wide as the output.
52323       // If they are larger than the output, we extract a subvector below.
52324       if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
52325           In1.getValueSizeInBits() < VT.getSizeInBits())
52326         return SDValue();
52327     }
52328     // Mul is commutative so the input vectors can be in any order.
52329     // Canonicalize to make the compares easier.
52330     if (In0 != N00In)
52331       std::swap(N00In, N01In);
52332     if (In0 != N10In)
52333       std::swap(N10In, N11In);
52334     if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
52335       return SDValue();
52336   }
52337 
52338   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
52339                          ArrayRef<SDValue> Ops) {
52340     EVT OpVT = Ops[0].getValueType();
52341     assert(OpVT.getScalarType() == MVT::i16 &&
52342            "Unexpected scalar element type");
52343     assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
52344     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
52345                                  OpVT.getVectorNumElements() / 2);
52346     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
52347   };
52348 
52349   // If the output is narrower than an input, extract the low part of the input
52350   // vector.
52351   EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52352                                VT.getVectorNumElements() * 2);
52353   if (OutVT16.bitsLT(In0.getValueType())) {
52354     In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
52355                       DAG.getIntPtrConstant(0, DL));
52356   }
52357   if (OutVT16.bitsLT(In1.getValueType())) {
52358     In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
52359                       DAG.getIntPtrConstant(0, DL));
52360   }
52361   return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
52362                           PMADDBuilder);
52363 }
52364 
52365 // ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
52366 // If the upper element in each pair of both VPMADDWDs is zero, then we can
52367 // merge the operand elements and use the implicit add of VPMADDWD.
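// With the upper i16 lane of each pair zero in at least one operand of each
// VPMADDWD, every pairwise product-sum reduces to the product of the lower
// lanes, so interleaving the lower lanes of both multiplies lets one VPMADDWD's
// implicit pairwise add also perform the outer ADD.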
52368 // TODO: Add support for VPMADDUBSW (which isn't commutable).
52369 static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
52370                                    const SDLoc &DL, EVT VT) {
52371   if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
52372     return SDValue();
52373 
52374   // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
52375   if (VT.getSizeInBits() > 128)
52376     return SDValue();
52377 
52378   unsigned NumElts = VT.getVectorNumElements();
52379   MVT OpVT = N0.getOperand(0).getSimpleValueType();
52380   APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
52381   APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
52382 
52383   bool Op0HiZero =
52384       DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
52385       DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
52386   bool Op1HiZero =
52387       DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
52388       DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
52389 
52390   // TODO: Check for zero lower elements once we have actual codegen that
52391   // creates them.
52392   if (!Op0HiZero || !Op1HiZero)
52393     return SDValue();
52394 
52395   // Create a shuffle mask packing the lower elements from each VPMADDWD.
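  // e.g. for a v4i32 result (v8i16 operands) the mask is {0, 8, 2, 10, 4, 12, 6, 14},
  // interleaving the lower lane of each pair from the two source vectors.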
52396   SmallVector<int> Mask;
52397   for (int i = 0; i != (int)NumElts; ++i) {
52398     Mask.push_back(2 * i);
52399     Mask.push_back(2 * (i + NumElts));
52400   }
52401 
52402   SDValue LHS =
52403       DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
52404   SDValue RHS =
52405       DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
52406   return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
52407 }
52408 
52409 /// CMOV of constants requires materializing constant operands in registers.
52410 /// Try to fold those constants into an 'add' instruction to reduce instruction
52411 /// count. We do this with CMOV rather than the generic 'select' because there are
52412 /// earlier folds that may be used to turn select-of-constants into logic hacks.
52413 static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) {
52414   // If an operand is zero, add-of-0 gets simplified away, so that's clearly
52415   // better because we eliminate 1-2 instructions. This transform is still
52416   // an improvement without zero operands because we trade 2 constant moves and
52417   // 1 add for 2 adds (LEAs), as long as the constants can be represented as
52418   // immediate asm operands (fit in 32 bits).
52419   auto isSuitableCmov = [](SDValue V) {
52420     if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
52421       return false;
52422     if (!isa<ConstantSDNode>(V.getOperand(0)) ||
52423         !isa<ConstantSDNode>(V.getOperand(1)))
52424       return false;
52425     return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
52426            (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
52427             V.getConstantOperandAPInt(1).isSignedIntN(32));
52428   };
52429 
52430   // Match an appropriate CMOV as the first operand of the add.
52431   SDValue Cmov = N->getOperand(0);
52432   SDValue OtherOp = N->getOperand(1);
52433   if (!isSuitableCmov(Cmov))
52434     std::swap(Cmov, OtherOp);
52435   if (!isSuitableCmov(Cmov))
52436     return SDValue();
52437 
52438   EVT VT = N->getValueType(0);
52439   SDLoc DL(N);
52440   SDValue FalseOp = Cmov.getOperand(0);
52441   SDValue TrueOp = Cmov.getOperand(1);
52442 
52443   // We will push the add through the select, but we can potentially do better
52444   // if we know there is another add in the sequence and this is pointer math.
52445   // In that case, we can absorb an add into the trailing memory op and avoid
52446   // a 3-operand LEA which is likely slower than a 2-operand LEA.
52447   // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
52448   if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
52449       !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
52450       all_of(N->uses(), [&](SDNode *Use) {
52451         auto *MemNode = dyn_cast<MemSDNode>(Use);
52452         return MemNode && MemNode->getBasePtr().getNode() == N;
52453       })) {
52454     // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
52455     // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
52456     //       it is possible that choosing op1 might be better.
52457     SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
52458     FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
52459     TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
52460     Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
52461                        Cmov.getOperand(2), Cmov.getOperand(3));
52462     return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
52463   }
52464 
52465   // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
52466   FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
52467   TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
52468   return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
52469                      Cmov.getOperand(3));
52470 }
52471 
52472 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
52473                           TargetLowering::DAGCombinerInfo &DCI,
52474                           const X86Subtarget &Subtarget) {
52475   EVT VT = N->getValueType(0);
52476   SDValue Op0 = N->getOperand(0);
52477   SDValue Op1 = N->getOperand(1);
52478   SDLoc DL(N);
52479 
52480   if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG))
52481     return Select;
52482 
52483   if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
52484     return MAdd;
52485   if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
52486     return MAdd;
52487   if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
52488     return MAdd;
52489 
52490   // Try to synthesize horizontal adds from adds of shuffles.
52491   if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
52492     return V;
52493 
52494   // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
52495   // (sub Y, (sext (vXi1 X))).
52496   // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
52497   // generic DAG combine without a legal type check, but adding this there
52498   // caused regressions.
52499   if (VT.isVector()) {
52500     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52501     if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
52502         Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
52503         TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
52504       SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
52505       return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
52506     }
52507 
52508     if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
52509         Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
52510         TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
52511       SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
52512       return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
52513     }
52514   }
52515 
52516   return combineAddOrSubToADCOrSBB(N, DAG);
52517 }
52518 
52519 // Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
52520 // condition comes from the subtract node that produced -X. This matches the
52521 // cmov expansion for absolute value. By swapping the operands we convert abs
52522 // to nabs.
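// Negating a CMOV that selects between X and -X yields the same CMOV with its
// operands swapped, which is what allows the SUB to be rewritten as an ADD.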
52523 static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
52524   SDValue N0 = N->getOperand(0);
52525   SDValue N1 = N->getOperand(1);
52526 
52527   if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
52528     return SDValue();
52529 
52530   X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
52531   if (CC != X86::COND_S && CC != X86::COND_NS)
52532     return SDValue();
52533 
52534   // Condition should come from a negate operation.
52535   SDValue Cond = N1.getOperand(3);
52536   if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
52537     return SDValue();
52538   assert(Cond.getResNo() == 1 && "Unexpected result number");
52539 
52540   // Get the X and -X from the negate.
52541   SDValue NegX = Cond.getValue(0);
52542   SDValue X = Cond.getOperand(1);
52543 
52544   SDValue FalseOp = N1.getOperand(0);
52545   SDValue TrueOp = N1.getOperand(1);
52546 
52547   // Cmov operands should be X and NegX. Order doesn't matter.
52548   if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
52549     return SDValue();
52550 
52551   // Build a new CMOV with the operands swapped.
52552   SDLoc DL(N);
52553   MVT VT = N->getSimpleValueType(0);
52554   SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
52555                              N1.getOperand(2), Cond);
52556   // Convert sub to add.
52557   return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
52558 }
52559 
52560 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
52561                           TargetLowering::DAGCombinerInfo &DCI,
52562                           const X86Subtarget &Subtarget) {
52563   SDValue Op0 = N->getOperand(0);
52564   SDValue Op1 = N->getOperand(1);
52565 
52566   // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
52567   auto IsNonOpaqueConstant = [&](SDValue Op) {
52568     if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
52569       if (auto *Cst = dyn_cast<ConstantSDNode>(C))
52570         return !Cst->isOpaque();
52571       return true;
52572     }
52573     return false;
52574   };
52575 
52576   // X86 can't encode an immediate LHS of a sub. See if we can push the
52577   // negation into a preceding instruction. If the RHS of the sub is an XOR with
52578   // one use and a constant, invert the immediate, saving one register.
52579   // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
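  // This follows from two's complement: C1 - (X ^ C2) = C1 + ~(X ^ C2) + 1
  //                                                   = (X ^ ~C2) + (C1 + 1).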
52580   if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
52581       IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
52582     SDLoc DL(N);
52583     EVT VT = Op0.getValueType();
52584     SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
52585                                  DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
52586     SDValue NewAdd =
52587         DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
52588     return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
52589   }
52590 
52591   if (SDValue V = combineSubABS(N, DAG))
52592     return V;
52593 
52594   // Try to synthesize horizontal subs from subs of shuffles.
52595   if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
52596     return V;
52597 
52598   return combineAddOrSubToADCOrSBB(N, DAG);
52599 }
52600 
52601 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
52602                                     const X86Subtarget &Subtarget) {
52603   MVT VT = N->getSimpleValueType(0);
52604   SDLoc DL(N);
52605 
52606   if (N->getOperand(0) == N->getOperand(1)) {
52607     if (N->getOpcode() == X86ISD::PCMPEQ)
52608       return DAG.getConstant(-1, DL, VT);
52609     if (N->getOpcode() == X86ISD::PCMPGT)
52610       return DAG.getConstant(0, DL, VT);
52611   }
52612 
52613   return SDValue();
52614 }
52615 
52616 /// Helper that combines an array of subvector ops as if they were the operands
52617 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
52618 /// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
52619 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
52620                                       ArrayRef<SDValue> Ops, SelectionDAG &DAG,
52621                                       TargetLowering::DAGCombinerInfo &DCI,
52622                                       const X86Subtarget &Subtarget) {
52623   assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
52624   unsigned EltSizeInBits = VT.getScalarSizeInBits();
52625 
52626   if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
52627     return DAG.getUNDEF(VT);
52628 
52629   if (llvm::all_of(Ops, [](SDValue Op) {
52630         return ISD::isBuildVectorAllZeros(Op.getNode());
52631       }))
52632     return getZeroVector(VT, Subtarget, DAG, DL);
52633 
52634   SDValue Op0 = Ops[0];
52635   bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
52636 
52637   // Repeated subvectors.
52638   if (IsSplat &&
52639       (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
52640     // If this broadcast is inserted into both halves, use a larger broadcast.
52641     if (Op0.getOpcode() == X86ISD::VBROADCAST)
52642       return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
52643 
52644     // If this simple subvector load or scalar/subvector broadcast_load is
52645     // inserted into both halves, use a larger broadcast_load. Update other uses
52646     // to use an extracted subvector.
52647     if (ISD::isNormalLoad(Op0.getNode()) ||
52648         Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
52649         Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
52650       auto *Mem = cast<MemSDNode>(Op0);
52651       unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
52652                          ? X86ISD::VBROADCAST_LOAD
52653                          : X86ISD::SUBV_BROADCAST_LOAD;
52654       if (SDValue BcastLd =
52655               getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
52656         SDValue BcastSrc =
52657             extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
52658         DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
52659         return BcastLd;
52660       }
52661     }
52662 
52663     // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
52664     if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
52665         (Subtarget.hasAVX2() ||
52666          X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
52667                                               VT.getScalarType(), Subtarget)))
52668       return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
52669                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
52670                                      Op0.getOperand(0),
52671                                      DAG.getIntPtrConstant(0, DL)));
52672 
52673     // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
52674     if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
52675         (Subtarget.hasAVX2() ||
52676          (EltSizeInBits >= 32 &&
52677           X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
52678         Op0.getOperand(0).getValueType() == VT.getScalarType())
52679       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
52680 
52681     // concat_vectors(extract_subvector(broadcast(x)),
52682     //                extract_subvector(broadcast(x))) -> broadcast(x)
52683     if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
52684         Op0.getOperand(0).getValueType() == VT) {
52685       if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
52686           Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
52687         return Op0.getOperand(0);
52688     }
52689   }
52690 
52691   // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
52692   // Only handle concats of subvector high halves, which vperm2x128 is best at.
52693   // TODO: This should go in combineX86ShufflesRecursively eventually.
52694   if (VT.is256BitVector() && Ops.size() == 2) {
52695     SDValue Src0 = peekThroughBitcasts(Ops[0]);
52696     SDValue Src1 = peekThroughBitcasts(Ops[1]);
52697     if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
52698         Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
52699       EVT SrcVT0 = Src0.getOperand(0).getValueType();
52700       EVT SrcVT1 = Src1.getOperand(0).getValueType();
52701       unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
52702       unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
52703       if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
52704           Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
52705           Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
52706         return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
52707                            DAG.getBitcast(VT, Src0.getOperand(0)),
52708                            DAG.getBitcast(VT, Src1.getOperand(0)),
52709                            DAG.getTargetConstant(0x31, DL, MVT::i8));
52710       }
52711     }
52712   }
52713 
52714   // Repeated opcode.
52715   // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
52716   // but it currently struggles with different vector widths.
52717   if (llvm::all_of(Ops, [Op0](SDValue Op) {
52718         return Op.getOpcode() == Op0.getOpcode();
52719       })) {
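    // Helper that concatenates operand I of every op in SubOps into a single
    // vector of the given type.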
52720     auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
52721       SmallVector<SDValue> Subs;
52722       for (SDValue SubOp : SubOps)
52723         Subs.push_back(SubOp.getOperand(I));
52724       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
52725     };
52726 
52727     unsigned NumOps = Ops.size();
52728     switch (Op0.getOpcode()) {
52729     case X86ISD::VBROADCAST: {
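      // concat(vbroadcast(x), vbroadcast(y)) -> movddup(concat(x, y)) when
      // both broadcasts take the low f64 element of a 128-bit source.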
52730       if (!IsSplat && VT == MVT::v4f64 && llvm::all_of(Ops, [](SDValue Op) {
52731             return Op.getOperand(0).getValueType().is128BitVector();
52732           }))
52733         return DAG.getNode(X86ISD::MOVDDUP, DL, VT,
52734                            ConcatSubOperand(VT, Ops, 0));
52735       break;
52736     }
52737     case X86ISD::MOVDDUP:
52738     case X86ISD::MOVSHDUP:
52739     case X86ISD::MOVSLDUP: {
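      // concat(movddup(x), movddup(y)) -> movddup(concat(x, y)) - these dup
      // ops work per 128-bit lane, and likewise for movshdup/movsldup.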
52740       if (!IsSplat)
52741         return DAG.getNode(Op0.getOpcode(), DL, VT,
52742                            ConcatSubOperand(VT, Ops, 0));
52743       break;
52744     }
52745     case X86ISD::SHUFP: {
52746       // Add SHUFPD support if/when necessary.
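      // concat(shufps(x0,y0,m), shufps(x1,y1,m))
      //   -> shufps(concat(x0,x1), concat(y0,y1), m)
      // - the SHUFPS immediate is applied per 128-bit lane.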
52747       if (!IsSplat && VT.getScalarType() == MVT::f32 &&
52748           llvm::all_of(Ops, [Op0](SDValue Op) {
52749             return Op.getOperand(2) == Op0.getOperand(2);
52750           })) {
52751         return DAG.getNode(Op0.getOpcode(), DL, VT,
52752                            ConcatSubOperand(VT, Ops, 0),
52753                            ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
52754       }
52755       break;
52756     }
52757     case X86ISD::PSHUFHW:
52758     case X86ISD::PSHUFLW:
52759     case X86ISD::PSHUFD:
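      // concat(pshufd(x,m), pshufd(y,m)) -> pshufd(concat(x,y), m) - these
      // word/dword shuffles repeat per 128-bit lane.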
52760       if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
52761           Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
52762         return DAG.getNode(Op0.getOpcode(), DL, VT,
52763                            ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
52764       }
52765       LLVM_FALLTHROUGH;
52766     case X86ISD::VPERMILPI:
52767       if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
52768           Op0.getOperand(1) == Ops[1].getOperand(1)) {
52769         SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
52770         Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
52771                           Op0.getOperand(1));
52772         return DAG.getBitcast(VT, Res);
52773       }
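      // concat(vpermilpd(x,i0), vpermilpd(y,i1))
      //   -> vpermilpd(concat(x,y), ((i1 & 3) << 2) | (i0 & 3))
      // - a v2f64 VPERMILPD immediate only uses its low 2 bits.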
52774       if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
52775         uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
52776         uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
52777         uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
52778         return DAG.getNode(Op0.getOpcode(), DL, VT,
52779                            ConcatSubOperand(VT, Ops, 0),
52780                            DAG.getTargetConstant(Idx, DL, MVT::i8));
52781       }
52782       break;
52783     case X86ISD::VPERMV3:
52784       if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
52785         MVT OpVT = Op0.getSimpleValueType();
52786         int NumSrcElts = OpVT.getVectorNumElements();
52787         SmallVector<int, 64> ConcatMask;
52788         for (unsigned i = 0; i != NumOps; ++i) {
52789           SmallVector<int, 64> SubMask;
52790           SmallVector<SDValue, 2> SubOps;
52791           if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
52792                                     SubMask))
52793             break;
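          // Remap the mask into the concatenated operand space: indices into
          // the second source shift up by NumSrcElts (the first source has
          // doubled in size), and op i's indices shift up by i * NumSrcElts.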
52794           for (int M : SubMask) {
52795             if (0 <= M) {
52796               M += M < NumSrcElts ? 0 : NumSrcElts;
52797               M += i * NumSrcElts;
52798             }
52799             ConcatMask.push_back(M);
52800           }
52801         }
52802         if (ConcatMask.size() == (NumOps * NumSrcElts)) {
52803           SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
52804                                           Ops[1].getOperand(0), DAG, DL);
52805           SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
52806                                           Ops[1].getOperand(2), DAG, DL);
52807           MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
52808           MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
52809           SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
52810           return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
52811         }
52812       }
52813       break;
52814     case X86ISD::VSHLI:
52815     case X86ISD::VSRLI:
52816       // Special case: an AVX1 v4i64 SHL/SRL by 32 bits can lower as a shuffle.
52817       // TODO: Move this to LowerShiftByScalarImmediate?
52818       if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
52819           llvm::all_of(Ops, [](SDValue Op) {
52820             return Op.getConstantOperandAPInt(1) == 32;
52821           })) {
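        // A v2i64 shift by 32 moves one 32-bit half of each element across
        // the dword boundary and zeroes the other half, i.e. a v8i32 shuffle
        // with a zero vector.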
52822         SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
52823         SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
52824         if (Op0.getOpcode() == X86ISD::VSHLI) {
52825           Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
52826                                      {8, 0, 8, 2, 8, 4, 8, 6});
52827         } else {
52828           Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
52829                                      {1, 8, 3, 8, 5, 8, 7, 8});
52830         }
52831         return DAG.getBitcast(VT, Res);
52832       }
52833       LLVM_FALLTHROUGH;
52834     case X86ISD::VSRAI:
52835     case X86ISD::VSHL:
52836     case X86ISD::VSRL:
52837     case X86ISD::VSRA:
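      // concat(shift(x0,amt), shift(x1,amt)) -> shift(concat(x0,x1), amt)
      // when every op uses the same shift amount.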
52838       if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
52839            (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
52840             (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
52841           llvm::all_of(Ops, [Op0](SDValue Op) {
52842             return Op0.getOperand(1) == Op.getOperand(1);
52843           })) {
52844         return DAG.getNode(Op0.getOpcode(), DL, VT,
52845                            ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
52846       }
52847       break;
52848     case X86ISD::VPERMI:
52849     case X86ISD::VROTLI:
52850     case X86ISD::VROTRI:
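      // concat(op(x0,imm), op(x1,imm)) -> op(concat(x0,x1), imm) for the
      // 512-bit forms when both ops share the same immediate.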
52851       if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
52852           llvm::all_of(Ops, [Op0](SDValue Op) {
52853             return Op0.getOperand(1) == Op.getOperand(1);
52854           })) {
52855         return DAG.getNode(Op0.getOpcode(), DL, VT,
52856                            ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
52857       }
52858       break;
52859     case ISD::AND:
52860     case ISD::OR:
52861     case ISD::XOR:
52862     case X86ISD::ANDNP:
52863       // TODO: Add 256-bit support.
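      // concat(logic(x0,y0), logic(x1,y1))
      //   -> logic(concat(x0,x1), concat(y0,y1))
      // - bitwise logic ops widen freely.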
52864       if (!IsSplat && VT.is512BitVector()) {
52865         MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
52866         SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
52867                                  NumOps * SrcVT.getVectorNumElements());
52868         return DAG.getNode(Op0.getOpcode(), DL, VT,
52869                            ConcatSubOperand(SrcVT, Ops, 0),
52870                            ConcatSubOperand(SrcVT, Ops, 1));
52871       }
52872       break;
52873     case X86ISD::HADD:
52874     case X86ISD::HSUB:
52875     case X86ISD::FHADD:
52876     case X86ISD::FHSUB:
52877     case X86ISD::PACKSS:
52878     case X86ISD::PACKUS:
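      // concat(hop(x0,y0), hop(x1,y1)) -> hop(concat(x0,x1), concat(y0,y1))
      // - horizontal add/sub and pack ops operate per 128-bit lane.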
52879       if (!IsSplat && VT.is256BitVector() &&
52880           (VT.isFloatingPoint() || Subtarget.hasInt256())) {
52881         MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
52882         SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
52883                                  NumOps * SrcVT.getVectorNumElements());
52884         return DAG.getNode(Op0.getOpcode(), DL, VT,
52885                            ConcatSubOperand(SrcVT, Ops, 0),
52886                            ConcatSubOperand(SrcVT, Ops, 1));
52887       }
52888       break;
52889     case X86ISD::PALIGNR:
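      // concat(palignr(x0,y0,imm), palignr(x1,y1,imm))
      //   -> palignr(concat(x0,x1), concat(y0,y1), imm)
      // - PALIGNR operates within each 128-bit lane.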
52890       if (!IsSplat &&
52891           ((VT.is256BitVector() && Subtarget.hasInt256()) ||
52892            (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
52893           llvm::all_of(Ops, [Op0](SDValue Op) {
52894             return Op0.getOperand(2) == Op.getOperand(2);
52895           })) {
52896         return DAG.getNode(Op0.getOpcode(), DL, VT,
52897                            ConcatSubOperand(VT, Ops, 0),
52898                            ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
52899       }
52900       break;
52901     }
52902   }
52903 
52904   // Fold subvector loads into one.
52905   // If needed, look through bitcasts to get to the load.
52906   if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
52907     bool Fast;
52908     const X86TargetLowering *TLI = Subtarget.getTargetLowering();
52909     if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
52910                                 *FirstLd->getMemOperand(), &Fast) &&
52911         Fast) {
52912       if (SDValue Ld =
52913               EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
52914         return Ld;
52915     }
52916   }
52917 
52918   return SDValue();
52919 }
52920 
52921 static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
52922                                     TargetLowering::DAGCombinerInfo &DCI,
52923                                     const X86Subtarget &Subtarget) {
52924   EVT VT = N->getValueType(0);
52925   EVT SrcVT = N->getOperand(0).getValueType();
52926   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52927 
52928   // Don't do anything for i1 vectors.
52929   if (VT.getVectorElementType() == MVT::i1)
52930     return SDValue();
52931 
52932   if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
52933     SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
52934     if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
52935                                            DCI, Subtarget))
52936       return R;
52937   }
52938 
52939   return SDValue();
52940 }
52941 
52942 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
52943                                       TargetLowering::DAGCombinerInfo &DCI,
52944                                       const X86Subtarget &Subtarget) {
52945   if (DCI.isBeforeLegalizeOps())
52946     return SDValue();
52947 
52948   MVT OpVT = N->getSimpleValueType(0);
52949 
52950   bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
52951 
52952   SDLoc dl(N);
52953   SDValue Vec = N->getOperand(0);
52954   SDValue SubVec = N->getOperand(1);
52955 
52956   uint64_t IdxVal = N->getConstantOperandVal(2);
52957   MVT SubVecVT = SubVec.getSimpleValueType();
52958 
52959   if (Vec.isUndef() && SubVec.isUndef())
52960     return DAG.getUNDEF(OpVT);
52961 
52962   // Inserting undefs/zeros into zeros/undefs yields a zero vector.
52963   if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
52964       (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
52965     return getZeroVector(OpVT, Subtarget, DAG, dl);
52966 
52967   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
52968     // If we're inserting into a zero vector and then into a larger zero vector,
52969     // just insert into the larger zero vector directly.
52970     if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
52971         ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
52972       uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
52973       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
52974                          getZeroVector(OpVT, Subtarget, DAG, dl),
52975                          SubVec.getOperand(1),
52976                          DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
52977     }
52978 
52979     // If we're inserting into a zero vector, our input was extracted from an
52980     // insert into a zero vector of the same type, and the extraction was at
52981     // least as large as the original insertion, just insert the original
52982     // subvector into a zero vector.
52983     if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
52984         isNullConstant(SubVec.getOperand(1)) &&
52985         SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
52986       SDValue Ins = SubVec.getOperand(0);
52987       if (isNullConstant(Ins.getOperand(2)) &&
52988           ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
52989           Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
52990               SubVecVT.getFixedSizeInBits())
52991         return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
52992                            getZeroVector(OpVT, Subtarget, DAG, dl),
52993                            Ins.getOperand(1), N->getOperand(2));
52994     }
52995   }
52996 
52997   // Stop here if this is an i1 vector.
52998   if (IsI1Vector)
52999     return SDValue();
53000 
53001   // If this is an insert of an extract, combine to a shuffle. Don't do this
53002   // if the insert or extract can be represented with a subregister operation.
53003   if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53004       SubVec.getOperand(0).getSimpleValueType() == OpVT &&
53005       (IdxVal != 0 ||
53006        !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
53007     int ExtIdxVal = SubVec.getConstantOperandVal(1);
53008     if (ExtIdxVal != 0) {
53009       int VecNumElts = OpVT.getVectorNumElements();
53010       int SubVecNumElts = SubVecVT.getVectorNumElements();
53011       SmallVector<int, 64> Mask(VecNumElts);
53012       // First create an identity shuffle mask.
53013       for (int i = 0; i != VecNumElts; ++i)
53014         Mask[i] = i;
53015       // Now insert the extracted portion.
53016       for (int i = 0; i != SubVecNumElts; ++i)
53017         Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
53018 
53019       return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
53020     }
53021   }
53022 
53023   // Match concat_vector style patterns.
53024   SmallVector<SDValue, 2> SubVectorOps;
53025   if (collectConcatOps(N, SubVectorOps)) {
53026     if (SDValue Fold =
53027             combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
53028       return Fold;
53029 
53030     // If we're inserting all zeros into the upper half, change this to
53031     // a concat with zero. We will match this to a move
53032     // with implicit upper bit zeroing during isel.
53033     // We do this here because we don't want combineConcatVectorOps to
53034     // create INSERT_SUBVECTOR from CONCAT_VECTORS.
53035     if (SubVectorOps.size() == 2 &&
53036         ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
53037       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
53038                          getZeroVector(OpVT, Subtarget, DAG, dl),
53039                          SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
53040   }
53041 
53042   // If this is a broadcast insert into an upper undef, use a larger broadcast.
53043   if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
53044     return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
53045 
53046   // If this is a broadcast load inserted into an upper undef, use a larger
53047   // broadcast load.
53048   if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
53049       SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
53050     auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
53051     SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
53052     SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
53053     SDValue BcastLd =
53054         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
53055                                 MemIntr->getMemoryVT(),
53056                                 MemIntr->getMemOperand());
53057     DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
53058     return BcastLd;
53059   }
53060 
53061   // If we're splatting the lower half subvector of a full vector load into the
53062   // upper half, attempt to create a subvector broadcast.
53063   if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
53064       Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
53065     auto *VecLd = dyn_cast<LoadSDNode>(Vec);
53066     auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
53067     if (VecLd && SubLd &&
53068         DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
53069                                            SubVec.getValueSizeInBits() / 8, 0))
53070       return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
53071                                SubLd, 0, DAG);
53072   }
53073 
53074   return SDValue();
53075 }
53076 
53077 /// If we are extracting a subvector of a vector select and the select condition
53078 /// is composed of concatenated vectors, try to narrow the select width. This
53079 /// is a common pattern for AVX1 integer code because 256-bit selects may be
53080 /// legal, but there is almost no integer math/logic available for 256-bit vectors.
53081 /// This function should only be called with legal types (otherwise, the calls
53082 /// to get simple value types will assert).
53083 static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
53084   SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
53085   SmallVector<SDValue, 4> CatOps;
53086   if (Sel.getOpcode() != ISD::VSELECT ||
53087       !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
53088     return SDValue();
53089 
53090   // Note: We assume simple value types because this should only be called with
53091   //       legal operations/types.
53092   // TODO: This can be extended to handle extraction to 256-bits.
53093   MVT VT = Ext->getSimpleValueType(0);
53094   if (!VT.is128BitVector())
53095     return SDValue();
53096 
53097   MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
53098   if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
53099     return SDValue();
53100 
53101   MVT WideVT = Ext->getOperand(0).getSimpleValueType();
53102   MVT SelVT = Sel.getSimpleValueType();
53103   assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
53104          "Unexpected vector type with legal operations");
53105 
53106   unsigned SelElts = SelVT.getVectorNumElements();
53107   unsigned CastedElts = WideVT.getVectorNumElements();
53108   unsigned ExtIdx = Ext->getConstantOperandVal(1);
53109   if (SelElts % CastedElts == 0) {
53110     // The select has the same or more (narrower) elements than the extract
53111     // operand. The extraction index gets scaled by that factor.
53112     ExtIdx *= (SelElts / CastedElts);
53113   } else if (CastedElts % SelElts == 0) {
53114     // The select has fewer (wider) elements than the extract operand. Make sure
53115     // that the extraction index can be divided evenly.
53116     unsigned IndexDivisor = CastedElts / SelElts;
53117     if (ExtIdx % IndexDivisor != 0)
53118       return SDValue();
53119     ExtIdx /= IndexDivisor;
53120   } else {
53121     llvm_unreachable("Element counts of simple vector types are not divisible?");
53122   }
53123 
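  // Narrow the select by the same factor that the extract narrows the wide
  // operand, then select on the matching 128-bit pieces of the condition and
  // both select operands.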
53124   unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
53125   unsigned NarrowElts = SelElts / NarrowingFactor;
53126   MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
53127   SDLoc DL(Ext);
53128   SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
53129   SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
53130   SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
53131   SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
53132   return DAG.getBitcast(VT, NarrowSel);
53133 }
53134 
53135 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
53136                                        TargetLowering::DAGCombinerInfo &DCI,
53137                                        const X86Subtarget &Subtarget) {
53138   // For AVX1 only, if we are extracting from a 256-bit and+not (which will
53139   // eventually get combined/lowered into ANDNP) with a concatenated operand,
53140   // split the 'and' into 128-bit ops to avoid the concatenate and extract.
53141   // We let generic combining take over from there to simplify the
53142   // insert/extract and 'not'.
53143   // This pattern emerges during AVX1 legalization. We handle it before lowering
53144   // to avoid complications like splitting constant vector loads.
53145 
53146   // Capture the original wide type in the likely case that we need to bitcast
53147   // back to this type.
53148   if (!N->getValueType(0).isSimple())
53149     return SDValue();
53150 
53151   MVT VT = N->getSimpleValueType(0);
53152   SDValue InVec = N->getOperand(0);
53153   unsigned IdxVal = N->getConstantOperandVal(1);
53154   SDValue InVecBC = peekThroughBitcasts(InVec);
53155   EVT InVecVT = InVec.getValueType();
53156   unsigned SizeInBits = VT.getSizeInBits();
53157   unsigned InSizeInBits = InVecVT.getSizeInBits();
53158   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53159 
53160   if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
53161       TLI.isTypeLegal(InVecVT) &&
53162       InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
53163     auto isConcatenatedNot = [](SDValue V) {
53164       V = peekThroughBitcasts(V);
53165       if (!isBitwiseNot(V))
53166         return false;
53167       SDValue NotOp = V->getOperand(0);
53168       return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
53169     };
53170     if (isConcatenatedNot(InVecBC.getOperand(0)) ||
53171         isConcatenatedNot(InVecBC.getOperand(1))) {
53172       // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
53173       SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
53174       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
53175                          DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
53176     }
53177   }
53178 
53179   if (DCI.isBeforeLegalizeOps())
53180     return SDValue();
53181 
53182   if (SDValue V = narrowExtractedVectorSelect(N, DAG))
53183     return V;
53184 
53185   if (ISD::isBuildVectorAllZeros(InVec.getNode()))
53186     return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
53187 
53188   if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
53189     if (VT.getScalarType() == MVT::i1)
53190       return DAG.getConstant(1, SDLoc(N), VT);
53191     return getOnesVector(VT, DAG, SDLoc(N));
53192   }
53193 
53194   if (InVec.getOpcode() == ISD::BUILD_VECTOR)
53195     return DAG.getBuildVector(
53196         VT, SDLoc(N),
53197         InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
53198 
53199   // If we are extracting from an insert into a zero vector, replace with a
53200   // smaller insert into zero as long as the extraction covers at least the
53201   // originally inserted subvector. Don't do this for i1 vectors.
53202   if (VT.getVectorElementType() != MVT::i1 &&
53203       InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
53204       InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
53205       ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
53206       InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
53207     SDLoc DL(N);
53208     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
53209                        getZeroVector(VT, Subtarget, DAG, DL),
53210                        InVec.getOperand(1), InVec.getOperand(2));
53211   }
53212 
53213   // If we're extracting an upper subvector from a broadcast, we should just
53214   // extract the lowest subvector instead, which should allow
53215   // SimplifyDemandedVectorElts to do more simplifications.
53216   if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
53217                       InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
53218                       DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
53219     return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
53220 
53221   // If we're extracting a broadcasted subvector, just use the lowest subvector.
53222   if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53223       cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
53224     return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
53225 
53226   // Attempt to extract from the source of a shuffle vector.
53227   if ((InSizeInBits % SizeInBits) == 0 &&
53228       (IdxVal % VT.getVectorNumElements()) == 0) {
53229     SmallVector<int, 32> ShuffleMask;
53230     SmallVector<int, 32> ScaledMask;
53231     SmallVector<SDValue, 2> ShuffleInputs;
53232     unsigned NumSubVecs = InSizeInBits / SizeInBits;
53233     // Decode the shuffle mask and scale it so that it shuffles whole subvectors.
53234     if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
53235         scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
53236       unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
53237       if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
53238         return DAG.getUNDEF(VT);
53239       if (ScaledMask[SubVecIdx] == SM_SentinelZero)
53240         return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
53241       SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
53242       if (Src.getValueSizeInBits() == InSizeInBits) {
53243         unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
53244         unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
53245         return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
53246                                 SDLoc(N), SizeInBits);
53247       }
53248     }
53249   }
53250 
53251   // If we're extracting the lowest subvector and we're the only user,
53252   // we may be able to perform this with a smaller vector width.
53253   unsigned InOpcode = InVec.getOpcode();
53254   if (IdxVal == 0 && InVec.hasOneUse()) {
53255     if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
53256       // v2f64 CVTDQ2PD(v4i32).
53257       if (InOpcode == ISD::SINT_TO_FP &&
53258           InVec.getOperand(0).getValueType() == MVT::v4i32) {
53259         return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
53260       }
53261       // v2f64 CVTUDQ2PD(v4i32).
53262       if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
53263           InVec.getOperand(0).getValueType() == MVT::v4i32) {
53264         return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
53265       }
53266       // v2f64 CVTPS2PD(v4f32).
53267       if (InOpcode == ISD::FP_EXTEND &&
53268           InVec.getOperand(0).getValueType() == MVT::v4f32) {
53269         return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
53270       }
53271     }
53272     if ((InOpcode == ISD::ANY_EXTEND ||
53273          InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
53274          InOpcode == ISD::ZERO_EXTEND ||
53275          InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
53276          InOpcode == ISD::SIGN_EXTEND ||
53277          InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
53278         (SizeInBits == 128 || SizeInBits == 256) &&
53279         InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
53280       SDLoc DL(N);
53281       SDValue Ext = InVec.getOperand(0);
53282       if (Ext.getValueSizeInBits() > SizeInBits)
53283         Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
53284       unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
53285       return DAG.getNode(ExtOp, DL, VT, Ext);
53286     }
53287     if (InOpcode == ISD::VSELECT &&
53288         InVec.getOperand(0).getValueType().is256BitVector() &&
53289         InVec.getOperand(1).getValueType().is256BitVector() &&
53290         InVec.getOperand(2).getValueType().is256BitVector()) {
53291       SDLoc DL(N);
53292       SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
53293       SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
53294       SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
53295       return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
53296     }
53297     if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
53298         (VT.is128BitVector() || VT.is256BitVector())) {
53299       SDLoc DL(N);
53300       SDValue InVecSrc = InVec.getOperand(0);
53301       unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
53302       SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
53303       return DAG.getNode(InOpcode, DL, VT, Ext);
53304     }
53305   }
53306 
53307   // Always split vXi64 logical shifts where we're extracting the upper 32 bits,
53308   // as this is very likely to fold into a shuffle/truncation.
53309   if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
53310       InVecVT.getScalarSizeInBits() == 64 &&
53311       InVec.getConstantOperandAPInt(1) == 32) {
53312     SDLoc DL(N);
53313     SDValue Ext =
53314         extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
53315     return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
53316   }
53317 
53318   return SDValue();
53319 }
53320 
53321 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
53322   EVT VT = N->getValueType(0);
53323   SDValue Src = N->getOperand(0);
53324   SDLoc DL(N);
53325 
53326   // If this is a v1i1 scalar_to_vector of an AND with 1, bypass the AND.
53327   // This occurs frequently in our masked scalar intrinsic code and our
53328   // floating point select lowering with AVX512.
53329   // TODO: SimplifyDemandedBits instead?
53330   if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
53331     if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
53332       if (C->getAPIntValue().isOne())
53333         return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
53334                            Src.getOperand(0));
53335 
53336   // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
53337   if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
53338       Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
53339       Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
53340     if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
53341       if (C->isZero())
53342         return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
53343                            Src.getOperand(1));
53344 
53345   // Reduce v2i64 to v4i32 if we don't need the upper bits.
53346   // TODO: Move to DAGCombine/SimplifyDemandedBits?
53347   if (VT == MVT::v2i64 || VT == MVT::v2f64) {
53348     auto IsAnyExt64 = [](SDValue Op) {
53349       if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
53350         return SDValue();
53351       if (Op.getOpcode() == ISD::ANY_EXTEND &&
53352           Op.getOperand(0).getScalarValueSizeInBits() <= 32)
53353         return Op.getOperand(0);
53354       if (auto *Ld = dyn_cast<LoadSDNode>(Op))
53355         if (Ld->getExtensionType() == ISD::EXTLOAD &&
53356             Ld->getMemoryVT().getScalarSizeInBits() <= 32)
53357           return Op;
53358       return SDValue();
53359     };
53360     if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
53361       return DAG.getBitcast(
53362           VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
53363                           DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
53364   }
53365 
53366   // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
53367   if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
53368       Src.getOperand(0).getValueType() == MVT::x86mmx)
53369     return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
53370 
53371   // See if we're broadcasting the scalar value, in which case just reuse that.
53372   // Make sure the broadcast takes exactly this SDValue (node and result number).
53373   if (VT.getScalarType() == Src.getValueType())
53374     for (SDNode *User : Src->uses())
53375       if (User->getOpcode() == X86ISD::VBROADCAST &&
53376           Src == User->getOperand(0)) {
53377         unsigned SizeInBits = VT.getFixedSizeInBits();
53378         unsigned BroadcastSizeInBits =
53379             User->getValueSizeInBits(0).getFixedSize();
53380         if (BroadcastSizeInBits == SizeInBits)
53381           return SDValue(User, 0);
53382         if (BroadcastSizeInBits > SizeInBits)
53383           return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
53384         // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
53385         // coverage.
53386       }
53387 
53388   return SDValue();
53389 }
53390 
53391 // Simplify PMULDQ and PMULUDQ operations.
53392 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
53393                              TargetLowering::DAGCombinerInfo &DCI,
53394                              const X86Subtarget &Subtarget) {
53395   SDValue LHS = N->getOperand(0);
53396   SDValue RHS = N->getOperand(1);
53397 
53398   // Canonicalize constant to RHS.
53399   if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
53400       !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
53401     return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
53402 
53403   // Multiply by zero.
53404   // Don't return RHS as it may contain UNDEFs.
53405   if (ISD::isBuildVectorAllZeros(RHS.getNode()))
53406     return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
53407 
53408   // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
53409   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53410   if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
53411     return SDValue(N, 0);
53412 
53413   // If the input is an extend_invec and the SimplifyDemandedBits call didn't
53414   // convert it to any_extend_invec, due to the LegalOperations check, do the
53415   // conversion directly to a vector shuffle manually. This exposes combine
53416   // opportunities missed by combineEXTEND_VECTOR_INREG not calling
53417   // combineX86ShufflesRecursively on SSE4.1 targets.
53418   // FIXME: This is basically a hack around several other issues related to
53419   // ANY_EXTEND_VECTOR_INREG.
53420   if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
53421       (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
53422        LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
53423       LHS.getOperand(0).getValueType() == MVT::v4i32) {
53424     SDLoc dl(N);
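    // Replace the extend with an in-place shuffle; the odd lanes are
    // don't-care since PMULDQ/PMULUDQ only reads the low 32 bits of each
    // 64-bit element.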
53425     LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
53426                                LHS.getOperand(0), { 0, -1, 1, -1 });
53427     LHS = DAG.getBitcast(MVT::v2i64, LHS);
53428     return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
53429   }
53430   if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
53431       (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
53432        RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
53433       RHS.getOperand(0).getValueType() == MVT::v4i32) {
53434     SDLoc dl(N);
53435     RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
53436                                RHS.getOperand(0), { 0, -1, 1, -1 });
53437     RHS = DAG.getBitcast(MVT::v2i64, RHS);
53438     return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
53439   }
53440 
53441   return SDValue();
53442 }
53443 
53444 // Simplify VPMADDUBSW/VPMADDWD operations.
53445 static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
53446                              TargetLowering::DAGCombinerInfo &DCI) {
53447   EVT VT = N->getValueType(0);
53448   SDValue LHS = N->getOperand(0);
53449   SDValue RHS = N->getOperand(1);
53450 
53451   // Multiply by zero.
53452   // Don't return LHS/RHS as it may contain UNDEFs.
53453   if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
53454       ISD::isBuildVectorAllZeros(RHS.getNode()))
53455     return DAG.getConstant(0, SDLoc(N), VT);
53456 
53457   APInt KnownUndef, KnownZero;
53458   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53459   APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
53460   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
53461                                      KnownZero, DCI))
53462     return SDValue(N, 0);
53463 
53464   return SDValue();
53465 }
53466 
53467 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
53468                                           TargetLowering::DAGCombinerInfo &DCI,
53469                                           const X86Subtarget &Subtarget) {
53470   EVT VT = N->getValueType(0);
53471   SDValue In = N->getOperand(0);
53472   unsigned Opcode = N->getOpcode();
53473   unsigned InOpcode = In.getOpcode();
53474   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53475 
53476   // Try to merge vector loads and extend_inreg to an extload.
53477   if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
53478       In.hasOneUse()) {
53479     auto *Ld = cast<LoadSDNode>(In);
53480     if (Ld->isSimple()) {
53481       MVT SVT = In.getSimpleValueType().getVectorElementType();
53482       ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
53483                                  ? ISD::SEXTLOAD
53484                                  : ISD::ZEXTLOAD;
53485       EVT MemVT = VT.changeVectorElementType(SVT);
53486       if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
53487         SDValue Load =
53488             DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
53489                            Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
53490                            Ld->getMemOperand()->getFlags());
53491         DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
53492         return Load;
53493       }
53494     }
53495   }
53496 
53497   // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
53498   if (Opcode == InOpcode)
53499     return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0));
53500 
53501   // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
53502   // -> EXTEND_VECTOR_INREG(X).
53503   // TODO: Handle non-zero subvector indices.
53504   if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
53505       In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
53506       In.getOperand(0).getOperand(0).getValueSizeInBits() ==
53507           In.getValueSizeInBits())
53508     return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0));
53509 
53510   // Attempt to combine as a shuffle.
53511   // TODO: General ZERO_EXTEND_VECTOR_INREG support.
53512   if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
53513       (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
53514     SDValue Op(N, 0);
53515     if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
53516       if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
53517         return Res;
53518   }
53519 
53520   return SDValue();
53521 }
53522 
53523 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
53524                              TargetLowering::DAGCombinerInfo &DCI) {
53525   EVT VT = N->getValueType(0);
53526 
53527   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
53528     return DAG.getConstant(0, SDLoc(N), VT);
53529 
53530   APInt KnownUndef, KnownZero;
53531   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53532   APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
53533   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
53534                                      KnownZero, DCI))
53535     return SDValue(N, 0);
53536 
53537   return SDValue();
53538 }
53539 
53540 // Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
53541 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
53542 // extra instructions between the conversions due to going to scalar and back.
53543 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
53544                                  const X86Subtarget &Subtarget) {
53545   if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
53546     return SDValue();
53547 
53548   if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
53549     return SDValue();
53550 
53551   if (N->getValueType(0) != MVT::f32 ||
53552       N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
53553     return SDValue();
53554 
53555   SDLoc dl(N);
53556   SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
53557                             N->getOperand(0).getOperand(0));
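  // The immediate 4 (bit 2 set) tells CVTPS2PH to round using the current
  // MXCSR rounding mode.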
53558   Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
53559                     DAG.getTargetConstant(4, dl, MVT::i32));
53560   Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
53561   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
53562                      DAG.getIntPtrConstant(0, dl));
53563 }
53564 
53565 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
53566                                 const X86Subtarget &Subtarget) {
53567   if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
53568     return SDValue();
53569 
53570   if (Subtarget.hasFP16())
53571     return SDValue();
53572 
53573   bool IsStrict = N->isStrictFPOpcode();
53574   EVT VT = N->getValueType(0);
53575   SDValue Src = N->getOperand(IsStrict ? 1 : 0);
53576   EVT SrcVT = Src.getValueType();
53577 
53578   if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
53579     return SDValue();
53580 
53581   if (VT.getVectorElementType() != MVT::f32 &&
53582       VT.getVectorElementType() != MVT::f64)
53583     return SDValue();
53584 
53585   unsigned NumElts = VT.getVectorNumElements();
53586   if (NumElts == 1 || !isPowerOf2_32(NumElts))
53587     return SDValue();
53588 
53589   SDLoc dl(N);
53590 
53591   // Convert the input to vXi16.
53592   EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
53593   Src = DAG.getBitcast(IntVT, Src);
53594 
53595   // Widen to at least 8 input elements.
53596   if (NumElts < 8) {
53597     unsigned NumConcats = 8 / NumElts;
53598     SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
53599                                 : DAG.getConstant(0, dl, IntVT);
53600     SmallVector<SDValue, 4> Ops(NumConcats, Fill);
53601     Ops[0] = Src;
53602     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
53603   }
53604 
53605   // Destination is vXf32 with at least 4 elements.
53606   EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
53607                                std::max(4U, NumElts));
53608   SDValue Cvt, Chain;
53609   if (IsStrict) {
53610     Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
53611                       {N->getOperand(0), Src});
53612     Chain = Cvt.getValue(1);
53613   } else {
53614     Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
53615   }
53616 
53617   if (NumElts < 4) {
53618     assert(NumElts == 2 && "Unexpected size");
53619     Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
53620                       DAG.getIntPtrConstant(0, dl));
53621   }
53622 
53623   if (IsStrict) {
53624     // Extend to the original VT if necessary.
53625     if (Cvt.getValueType() != VT) {
53626       Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
53627                         {Chain, Cvt});
53628       Chain = Cvt.getValue(1);
53629     }
53630     return DAG.getMergeValues({Cvt, Chain}, dl);
53631   }
53632 
53633   // Extend to the original VT if necessary.
53634   return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
53635 }
53636 
53637 // Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
53638 // from. Limit this to cases where the loads have the same input chain and the
53639 // output chains are unused. This avoids any memory ordering issues.
53640 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
53641                                      TargetLowering::DAGCombinerInfo &DCI) {
53642   assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
53643           N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
53644          "Unknown broadcast load type");
53645 
53646   // Only do this if the chain result is unused.
53647   if (N->hasAnyUseOfValue(1))
53648     return SDValue();
53649 
53650   auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
53651 
53652   SDValue Ptr = MemIntrin->getBasePtr();
53653   SDValue Chain = MemIntrin->getChain();
53654   EVT VT = N->getSimpleValueType(0);
53655   EVT MemVT = MemIntrin->getMemoryVT();
53656 
53657   // Look at other users of our base pointer and try to find a wider broadcast.
53658   // The input chain and the size of the memory VT must match.
53659   for (SDNode *User : Ptr->uses())
53660     if (User != N && User->getOpcode() == N->getOpcode() &&
53661         cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
53662         cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
53663         cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
53664             MemVT.getSizeInBits() &&
53665         !User->hasAnyUseOfValue(1) &&
53666         User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
53667       SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
53668                                          VT.getSizeInBits());
53669       Extract = DAG.getBitcast(VT, Extract);
53670       return DCI.CombineTo(N, Extract, SDValue(User, 1));
53671     }
53672 
53673   return SDValue();
53674 }
53675 
53676 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
53677                                const X86Subtarget &Subtarget) {
53678   if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
53679     return SDValue();
53680 
53681   if (Subtarget.hasFP16())
53682     return SDValue();
53683 
53684   EVT VT = N->getValueType(0);
53685   SDValue Src = N->getOperand(0);
53686   EVT SrcVT = Src.getValueType();
53687 
53688   if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
53689       SrcVT.getVectorElementType() != MVT::f32)
53690     return SDValue();
53691 
53692   unsigned NumElts = VT.getVectorNumElements();
53693   if (NumElts == 1 || !isPowerOf2_32(NumElts))
53694     return SDValue();
53695 
53696   SDLoc dl(N);
53697 
53698   // Widen to at least 4 input elements.
53699   if (NumElts < 4)
53700     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
53701                       DAG.getConstantFP(0.0, dl, SrcVT));
53702 
53703   // Destination is v8i16 with at least 8 elements.
53704   EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
53705                                std::max(8U, NumElts));
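  // Round using the current MXCSR rounding mode (immediate bit 2 set).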
53706   SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
53707                             DAG.getTargetConstant(4, dl, MVT::i32));
53708 
53709   // Extract down to the real number of elements.
53710   if (NumElts < 8) {
53711     EVT IntVT = VT.changeVectorElementTypeToInteger();
53712     Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
53713                       DAG.getIntPtrConstant(0, dl));
53714   }
53715 
53716   return DAG.getBitcast(VT, Cvt);
53717 }
53718 
53719 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
53720   SDValue Src = N->getOperand(0);
53721 
53722   // Turn MOVDQ2Q+simple_load into an mmx load.
53723   if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
53724     LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
53725 
53726     if (LN->isSimple()) {
53727       SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
53728                                   LN->getBasePtr(),
53729                                   LN->getPointerInfo(),
53730                                   LN->getOriginalAlign(),
53731                                   LN->getMemOperand()->getFlags());
53732       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
53733       return NewLd;
53734     }
53735   }
53736 
53737   return SDValue();
53738 }
53739 
53740 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
53741                            TargetLowering::DAGCombinerInfo &DCI) {
53742   unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
53743   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53744   if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
53745     return SDValue(N, 0);
53746 
53747   return SDValue();
53748 }
53749 
53750 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
53751                                              DAGCombinerInfo &DCI) const {
53752   SelectionDAG &DAG = DCI.DAG;
53753   switch (N->getOpcode()) {
53754   default: break;
53755   case ISD::SCALAR_TO_VECTOR:
53756     return combineScalarToVector(N, DAG);
53757   case ISD::EXTRACT_VECTOR_ELT:
53758   case X86ISD::PEXTRW:
53759   case X86ISD::PEXTRB:
53760     return combineExtractVectorElt(N, DAG, DCI, Subtarget);
53761   case ISD::CONCAT_VECTORS:
53762     return combineConcatVectors(N, DAG, DCI, Subtarget);
53763   case ISD::INSERT_SUBVECTOR:
53764     return combineInsertSubvector(N, DAG, DCI, Subtarget);
53765   case ISD::EXTRACT_SUBVECTOR:
53766     return combineExtractSubvector(N, DAG, DCI, Subtarget);
53767   case ISD::VSELECT:
53768   case ISD::SELECT:
53769   case X86ISD::BLENDV:      return combineSelect(N, DAG, DCI, Subtarget);
53770   case ISD::BITCAST:        return combineBitcast(N, DAG, DCI, Subtarget);
53771   case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
53772   case X86ISD::CMP:         return combineCMP(N, DAG);
53773   case ISD::ADD:            return combineAdd(N, DAG, DCI, Subtarget);
53774   case ISD::SUB:            return combineSub(N, DAG, DCI, Subtarget);
53775   case X86ISD::ADD:
53776   case X86ISD::SUB:         return combineX86AddSub(N, DAG, DCI);
53777   case X86ISD::SBB:         return combineSBB(N, DAG);
53778   case X86ISD::ADC:         return combineADC(N, DAG, DCI);
53779   case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
53780   case ISD::SHL:            return combineShiftLeft(N, DAG);
53781   case ISD::SRA:            return combineShiftRightArithmetic(N, DAG, Subtarget);
53782   case ISD::SRL:            return combineShiftRightLogical(N, DAG, DCI, Subtarget);
53783   case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
53784   case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
53785   case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
53786   case X86ISD::BEXTR:
53787   case X86ISD::BEXTRI:      return combineBEXTR(N, DAG, DCI, Subtarget);
53788   case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
53789   case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
53790   case ISD::STORE:          return combineStore(N, DAG, DCI, Subtarget);
53791   case ISD::MSTORE:         return combineMaskedStore(N, DAG, DCI, Subtarget);
53792   case X86ISD::VEXTRACT_STORE:
53793     return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
53794   case ISD::SINT_TO_FP:
53795   case ISD::STRICT_SINT_TO_FP:
53796     return combineSIntToFP(N, DAG, DCI, Subtarget);
53797   case ISD::UINT_TO_FP:
53798   case ISD::STRICT_UINT_TO_FP:
53799     return combineUIntToFP(N, DAG, Subtarget);
53800   case ISD::FADD:
53801   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
53802   case X86ISD::VFCMULC:
53803   case X86ISD::VFMULC:      return combineFMulcFCMulc(N, DAG, Subtarget);
53804   case ISD::FNEG:           return combineFneg(N, DAG, DCI, Subtarget);
53805   case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
53806   case X86ISD::VTRUNC:      return combineVTRUNC(N, DAG, DCI);
53807   case X86ISD::ANDNP:       return combineAndnp(N, DAG, DCI, Subtarget);
53808   case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
53809   case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
53810   case X86ISD::FXOR:
53811   case X86ISD::FOR:         return combineFOr(N, DAG, DCI, Subtarget);
53812   case X86ISD::FMIN:
53813   case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
53814   case ISD::FMINNUM:
53815   case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
53816   case X86ISD::CVTSI2P:
53817   case X86ISD::CVTUI2P:     return combineX86INT_TO_FP(N, DAG, DCI);
53818   case X86ISD::CVTP2SI:
53819   case X86ISD::CVTP2UI:
53820   case X86ISD::STRICT_CVTTP2SI:
53821   case X86ISD::CVTTP2SI:
53822   case X86ISD::STRICT_CVTTP2UI:
53823   case X86ISD::CVTTP2UI:
53824                             return combineCVTP2I_CVTTP2I(N, DAG, DCI);
53825   case X86ISD::STRICT_CVTPH2PS:
53826   case X86ISD::CVTPH2PS:    return combineCVTPH2PS(N, DAG, DCI);
53827   case X86ISD::BT:          return combineBT(N, DAG, DCI);
53828   case ISD::ANY_EXTEND:
53829   case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
53830   case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
53831   case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
53832   case ISD::ANY_EXTEND_VECTOR_INREG:
53833   case ISD::SIGN_EXTEND_VECTOR_INREG:
53834   case ISD::ZERO_EXTEND_VECTOR_INREG:
53835     return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
53836   case ISD::SETCC:          return combineSetCC(N, DAG, DCI, Subtarget);
53837   case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
53838   case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
53839   case X86ISD::PACKSS:
53840   case X86ISD::PACKUS:      return combineVectorPack(N, DAG, DCI, Subtarget);
53841   case X86ISD::HADD:
53842   case X86ISD::HSUB:
53843   case X86ISD::FHADD:
53844   case X86ISD::FHSUB:       return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
53845   case X86ISD::VSHL:
53846   case X86ISD::VSRA:
53847   case X86ISD::VSRL:
53848     return combineVectorShiftVar(N, DAG, DCI, Subtarget);
53849   case X86ISD::VSHLI:
53850   case X86ISD::VSRAI:
53851   case X86ISD::VSRLI:
53852     return combineVectorShiftImm(N, DAG, DCI, Subtarget);
53853   case ISD::INSERT_VECTOR_ELT:
53854   case X86ISD::PINSRB:
53855   case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
53856   case X86ISD::SHUFP:       // Handle all target-specific shuffles
53857   case X86ISD::INSERTPS:
53858   case X86ISD::EXTRQI:
53859   case X86ISD::INSERTQI:
53860   case X86ISD::VALIGN:
53861   case X86ISD::PALIGNR:
53862   case X86ISD::VSHLDQ:
53863   case X86ISD::VSRLDQ:
53864   case X86ISD::BLENDI:
53865   case X86ISD::UNPCKH:
53866   case X86ISD::UNPCKL:
53867   case X86ISD::MOVHLPS:
53868   case X86ISD::MOVLHPS:
53869   case X86ISD::PSHUFB:
53870   case X86ISD::PSHUFD:
53871   case X86ISD::PSHUFHW:
53872   case X86ISD::PSHUFLW:
53873   case X86ISD::MOVSHDUP:
53874   case X86ISD::MOVSLDUP:
53875   case X86ISD::MOVDDUP:
53876   case X86ISD::MOVSS:
53877   case X86ISD::MOVSD:
53878   case X86ISD::MOVSH:
53879   case X86ISD::VBROADCAST:
53880   case X86ISD::VPPERM:
53881   case X86ISD::VPERMI:
53882   case X86ISD::VPERMV:
53883   case X86ISD::VPERMV3:
53884   case X86ISD::VPERMIL2:
53885   case X86ISD::VPERMILPI:
53886   case X86ISD::VPERMILPV:
53887   case X86ISD::VPERM2X128:
53888   case X86ISD::SHUF128:
53889   case X86ISD::VZEXT_MOVL:
53890   case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
53891   case X86ISD::FMADD_RND:
53892   case X86ISD::FMSUB:
53893   case X86ISD::STRICT_FMSUB:
53894   case X86ISD::FMSUB_RND:
53895   case X86ISD::FNMADD:
53896   case X86ISD::STRICT_FNMADD:
53897   case X86ISD::FNMADD_RND:
53898   case X86ISD::FNMSUB:
53899   case X86ISD::STRICT_FNMSUB:
53900   case X86ISD::FNMSUB_RND:
53901   case ISD::FMA:
53902   case ISD::STRICT_FMA:     return combineFMA(N, DAG, DCI, Subtarget);
53903   case X86ISD::FMADDSUB_RND:
53904   case X86ISD::FMSUBADD_RND:
53905   case X86ISD::FMADDSUB:
53906   case X86ISD::FMSUBADD:    return combineFMADDSUB(N, DAG, DCI);
53907   case X86ISD::MOVMSK:      return combineMOVMSK(N, DAG, DCI, Subtarget);
53908   case X86ISD::MGATHER:
53909   case X86ISD::MSCATTER:
53910     return combineX86GatherScatter(N, DAG, DCI, Subtarget);
53911   case ISD::MGATHER:
53912   case ISD::MSCATTER:       return combineGatherScatter(N, DAG, DCI);
53913   case X86ISD::PCMPEQ:
53914   case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
53915   case X86ISD::PMULDQ:
53916   case X86ISD::PMULUDQ:     return combinePMULDQ(N, DAG, DCI, Subtarget);
53917   case X86ISD::VPMADDUBSW:
53918   case X86ISD::VPMADDWD:    return combineVPMADD(N, DAG, DCI);
53919   case X86ISD::KSHIFTL:
53920   case X86ISD::KSHIFTR:     return combineKSHIFT(N, DAG, DCI);
53921   case ISD::FP16_TO_FP:     return combineFP16_TO_FP(N, DAG, Subtarget);
53922   case ISD::STRICT_FP_EXTEND:
53923   case ISD::FP_EXTEND:      return combineFP_EXTEND(N, DAG, Subtarget);
53924   case ISD::FP_ROUND:       return combineFP_ROUND(N, DAG, Subtarget);
53925   case X86ISD::VBROADCAST_LOAD:
53926   case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
53927   case X86ISD::MOVDQ2Q:     return combineMOVDQ2Q(N, DAG);
53928   case X86ISD::PDEP:        return combinePDEP(N, DAG, DCI);
53929   }
53930 
53931   return SDValue();
53932 }
53933 
53934 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
53935   if (!isTypeLegal(VT))
53936     return false;
53937 
53938   // There are no vXi8 shifts.
53939   if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
53940     return false;
53941 
53942   // TODO: Almost no 8-bit ops are desirable because they have no actual
53943   //       size/speed advantages vs. 32-bit ops, but they do have a major
53944   //       potential disadvantage by causing partial register stalls.
53945   //
53946   // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
53947   // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
53948   // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
53949   // check for a constant operand to the multiply.
53950   if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
53951     return false;
53952 
53953   // i16 instruction encodings are longer and some i16 instructions are slow,
53954   // so those are not desirable.
53955   if (VT == MVT::i16) {
53956     switch (Opc) {
53957     default:
53958       break;
53959     case ISD::LOAD:
53960     case ISD::SIGN_EXTEND:
53961     case ISD::ZERO_EXTEND:
53962     case ISD::ANY_EXTEND:
53963     case ISD::SHL:
53964     case ISD::SRA:
53965     case ISD::SRL:
53966     case ISD::SUB:
53967     case ISD::ADD:
53968     case ISD::MUL:
53969     case ISD::AND:
53970     case ISD::OR:
53971     case ISD::XOR:
53972       return false;
53973     }
53974   }
53975 
53976   // Any legal type not explicitly accounted for above here is desirable.
53977   return true;
53978 }
53979 
53980 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
53981                                                   SDValue Value, SDValue Addr,
53982                                                   SelectionDAG &DAG) const {
53983   const Module *M = DAG.getMachineFunction().getMMI().getModule();
53984   Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
53985   if (IsCFProtectionSupported) {
53986     // When control-flow branch protection is enabled, we need to add the
53987     // NOTRACK prefix to the indirect branch. To do that we create an
53988     // NT_BRIND SDNode; during ISel the pattern converts it to a jmp with
53989     // the NoTrack prefix.
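    // (Under CET indirect-branch tracking, jump-table targets are ordinary
    // basic blocks without ENDBR markers, so the dispatch jump must be
    // exempted via NOTRACK.)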
53990     return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
53991   }
53992 
53993   return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
53994 }
53995 
53996 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
53997   EVT VT = Op.getValueType();
53998   bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
53999                              isa<ConstantSDNode>(Op.getOperand(1));
54000 
54001   // i16 is legal, but undesirable since i16 instruction encodings are longer
54002   // and some i16 instructions are slow.
54003   // 8-bit multiply-by-constant can usually be expanded to something cheaper
54004   // using LEA and/or other ALU ops.
54005   if (VT != MVT::i16 && !Is8BitMulByConstant)
54006     return false;
54007 
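  // The two helpers below spot read-modify-write patterns such as
  // (store (op (load p), x), p) and their atomic equivalents; these fold into
  // a single memory-operand instruction, which promoting the op to i32 would
  // prevent.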
54008   auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
54009     if (!Op.hasOneUse())
54010       return false;
54011     SDNode *User = *Op->use_begin();
54012     if (!ISD::isNormalStore(User))
54013       return false;
54014     auto *Ld = cast<LoadSDNode>(Load);
54015     auto *St = cast<StoreSDNode>(User);
54016     return Ld->getBasePtr() == St->getBasePtr();
54017   };
54018 
54019   auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
54020     if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
54021       return false;
54022     if (!Op.hasOneUse())
54023       return false;
54024     SDNode *User = *Op->use_begin();
54025     if (User->getOpcode() != ISD::ATOMIC_STORE)
54026       return false;
54027     auto *Ld = cast<AtomicSDNode>(Load);
54028     auto *St = cast<AtomicSDNode>(User);
54029     return Ld->getBasePtr() == St->getBasePtr();
54030   };
54031 
54032   bool Commute = false;
54033   switch (Op.getOpcode()) {
54034   default: return false;
54035   case ISD::SIGN_EXTEND:
54036   case ISD::ZERO_EXTEND:
54037   case ISD::ANY_EXTEND:
54038     break;
54039   case ISD::SHL:
54040   case ISD::SRA:
54041   case ISD::SRL: {
54042     SDValue N0 = Op.getOperand(0);
54043     // Look out for (store (shl (load), x)).
54044     if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
54045       return false;
54046     break;
54047   }
54048   case ISD::ADD:
54049   case ISD::MUL:
54050   case ISD::AND:
54051   case ISD::OR:
54052   case ISD::XOR:
54053     Commute = true;
54054     LLVM_FALLTHROUGH;
54055   case ISD::SUB: {
54056     SDValue N0 = Op.getOperand(0);
54057     SDValue N1 = Op.getOperand(1);
54058     // Avoid disabling potential load folding opportunities.
54059     if (X86::mayFoldLoad(N1, Subtarget) &&
54060         (!Commute || !isa<ConstantSDNode>(N0) ||
54061          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
54062       return false;
54063     if (X86::mayFoldLoad(N0, Subtarget) &&
54064         ((Commute && !isa<ConstantSDNode>(N1)) ||
54065          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
54066       return false;
54067     if (IsFoldableAtomicRMW(N0, Op) ||
54068         (Commute && IsFoldableAtomicRMW(N1, Op)))
54069       return false;
54070   }
54071   }
54072 
54073   PVT = MVT::i32;
54074   return true;
54075 }
54076 
54077 //===----------------------------------------------------------------------===//
54078 //                           X86 Inline Assembly Support
54079 //===----------------------------------------------------------------------===//
54080 
54081 // Helper to match an asm string against pieces separated by whitespace.
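// Each piece must match a whole whitespace-delimited token, e.g.
// matchAsm(" bswap $0", {"bswap", "$0"}) succeeds, while
// matchAsm("bswapl $0", {"bswap", "$0"}) fails because "bswap" is only a
// prefix of the first token.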
54082 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
54083   S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
54084 
54085   for (StringRef Piece : Pieces) {
54086     if (!S.startswith(Piece)) // Check if the piece matches.
54087       return false;
54088 
54089     S = S.substr(Piece.size());
54090     StringRef::size_type Pos = S.find_first_not_of(" \t");
54091     if (Pos == 0) // We matched a prefix.
54092       return false;
54093 
54094     S = S.substr(Pos);
54095   }
54096 
54097   return S.empty();
54098 }
54099 
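// Returns true if the remaining constraint pieces are just the standard
// condition-flag clobbers ("~{cc}", "~{flags}", "~{fpsr}" and optionally
// "~{dirflag}"), i.e. the asm has no other side effects that replacing it
// with llvm.bswap would lose.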
54100 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
54101 
54102   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
54103     if (llvm::is_contained(AsmPieces, "~{cc}") &&
54104         llvm::is_contained(AsmPieces, "~{flags}") &&
54105         llvm::is_contained(AsmPieces, "~{fpsr}")) {
54106 
54107       if (AsmPieces.size() == 3)
54108         return true;
54109       else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
54110         return true;
54111     }
54112   }
54113   return false;
54114 }
54115 
54116 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
54117   InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
54118 
54119   const std::string &AsmStr = IA->getAsmString();
54120 
54121   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
54122   if (!Ty || Ty->getBitWidth() % 16 != 0)
54123     return false;
54124 
54125   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
54126   SmallVector<StringRef, 4> AsmPieces;
54127   SplitString(AsmStr, AsmPieces, ";\n");
54128 
54129   switch (AsmPieces.size()) {
54130   default: return false;
54131   case 1:
54132     // FIXME: this should verify that we are targeting a 486 or better.  If not,
54133     // we will turn this bswap into something that will be lowered to logical
54134     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
54135     // lower so don't worry about this.
54136     // bswap $0
54137     if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
54138         matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
54139         matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
54140         matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
54141         matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
54142         matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
54143       // No need to check constraints, nothing other than the equivalent of
54144       // "=r,0" would be valid here.
54145       return IntrinsicLowering::LowerToByteSwap(CI);
54146     }
54147 
54148     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
54149     if (CI->getType()->isIntegerTy(16) &&
54150         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
54151         (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
54152          matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
54153       AsmPieces.clear();
54154       StringRef ConstraintsStr = IA->getConstraintString();
54155       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
54156       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
54157       if (clobbersFlagRegisters(AsmPieces))
54158         return IntrinsicLowering::LowerToByteSwap(CI);
54159     }
54160     break;
54161   case 3:
54162     if (CI->getType()->isIntegerTy(32) &&
54163         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
54164         matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
54165         matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
54166         matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
54167       AsmPieces.clear();
54168       StringRef ConstraintsStr = IA->getConstraintString();
54169       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
54170       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
54171       if (clobbersFlagRegisters(AsmPieces))
54172         return IntrinsicLowering::LowerToByteSwap(CI);
54173     }
54174 
54175     if (CI->getType()->isIntegerTy(64)) {
54176       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
54177       if (Constraints.size() >= 2 &&
54178           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
54179           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
54180         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
54181         if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
54182             matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
54183             matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
54184           return IntrinsicLowering::LowerToByteSwap(CI);
54185       }
54186     }
54187     break;
54188   }
54189   return false;
54190 }
54191 
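// Map an inline-asm flag-output constraint (written "=@cc<cond>" in asm
// source, matched here in its braced "{@cc<cond>}" form) to the corresponding
// X86 condition code.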
54192 static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
54193   X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
54194                            .Case("{@cca}", X86::COND_A)
54195                            .Case("{@ccae}", X86::COND_AE)
54196                            .Case("{@ccb}", X86::COND_B)
54197                            .Case("{@ccbe}", X86::COND_BE)
54198                            .Case("{@ccc}", X86::COND_B)
54199                            .Case("{@cce}", X86::COND_E)
54200                            .Case("{@ccz}", X86::COND_E)
54201                            .Case("{@ccg}", X86::COND_G)
54202                            .Case("{@ccge}", X86::COND_GE)
54203                            .Case("{@ccl}", X86::COND_L)
54204                            .Case("{@ccle}", X86::COND_LE)
54205                            .Case("{@ccna}", X86::COND_BE)
54206                            .Case("{@ccnae}", X86::COND_B)
54207                            .Case("{@ccnb}", X86::COND_AE)
54208                            .Case("{@ccnbe}", X86::COND_A)
54209                            .Case("{@ccnc}", X86::COND_AE)
54210                            .Case("{@ccne}", X86::COND_NE)
54211                            .Case("{@ccnz}", X86::COND_NE)
54212                            .Case("{@ccng}", X86::COND_LE)
54213                            .Case("{@ccnge}", X86::COND_L)
54214                            .Case("{@ccnl}", X86::COND_GE)
54215                            .Case("{@ccnle}", X86::COND_G)
54216                            .Case("{@ccno}", X86::COND_NO)
54217                            .Case("{@ccnp}", X86::COND_NP)
54218                            .Case("{@ccns}", X86::COND_NS)
54219                            .Case("{@cco}", X86::COND_O)
54220                            .Case("{@ccp}", X86::COND_P)
54221                            .Case("{@ccs}", X86::COND_S)
54222                            .Default(X86::COND_INVALID);
54223   return Cond;
54224 }
54225 
54226 /// Given a constraint letter, return the type of constraint for this target.
54227 X86TargetLowering::ConstraintType
54228 X86TargetLowering::getConstraintType(StringRef Constraint) const {
54229   if (Constraint.size() == 1) {
54230     switch (Constraint[0]) {
54231     case 'R':
54232     case 'q':
54233     case 'Q':
54234     case 'f':
54235     case 't':
54236     case 'u':
54237     case 'y':
54238     case 'x':
54239     case 'v':
54240     case 'l':
54241     case 'k': // AVX512 masking registers.
54242       return C_RegisterClass;
54243     case 'a':
54244     case 'b':
54245     case 'c':
54246     case 'd':
54247     case 'S':
54248     case 'D':
54249     case 'A':
54250       return C_Register;
54251     case 'I':
54252     case 'J':
54253     case 'K':
54254     case 'N':
54255     case 'G':
54256     case 'L':
54257     case 'M':
54258       return C_Immediate;
54259     case 'C':
54260     case 'e':
54261     case 'Z':
54262       return C_Other;
54263     default:
54264       break;
54265     }
54266   } else if (Constraint.size() == 2) {
54268     switch (Constraint[0]) {
54269     default:
54270       break;
54271     case 'Y':
54272       switch (Constraint[1]) {
54273       default:
54274         break;
54275       case 'z':
54276         return C_Register;
54277       case 'i':
54278       case 'm':
54279       case 'k':
54280       case 't':
54281       case '2':
54282         return C_RegisterClass;
54283       }
54284     }
54285   } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
54286     return C_Other;
54287   return TargetLowering::getConstraintType(Constraint);
54288 }
54289 
54290 /// Examine constraint type and operand type and determine a weight value.
54291 /// This object must already have been set up with the operand type
54292 /// and the current alternative constraint selected.
54293 TargetLowering::ConstraintWeight
54294 X86TargetLowering::getSingleConstraintMatchWeight(
54295     AsmOperandInfo &info, const char *constraint) const {
54296   ConstraintWeight weight = CW_Invalid;
54297   Value *CallOperandVal = info.CallOperandVal;
54298   // If we don't have a value, we can't do a match,
54299   // but allow it at the lowest weight.
54300   if (!CallOperandVal)
54301     return CW_Default;
54302   Type *type = CallOperandVal->getType();
54303   // Look at the constraint type.
54304   switch (*constraint) {
54305   default:
54306     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
54307     LLVM_FALLTHROUGH;
54308   case 'R':
54309   case 'q':
54310   case 'Q':
54311   case 'a':
54312   case 'b':
54313   case 'c':
54314   case 'd':
54315   case 'S':
54316   case 'D':
54317   case 'A':
54318     if (CallOperandVal->getType()->isIntegerTy())
54319       weight = CW_SpecificReg;
54320     break;
54321   case 'f':
54322   case 't':
54323   case 'u':
54324     if (type->isFloatingPointTy())
54325       weight = CW_SpecificReg;
54326     break;
54327   case 'y':
54328     if (type->isX86_MMXTy() && Subtarget.hasMMX())
54329       weight = CW_SpecificReg;
54330     break;
54331   case 'Y':
54332     if (StringRef(constraint).size() != 2)
54333       break;
54334     switch (constraint[1]) {
54335       default:
54336         return CW_Invalid;
54337       // XMM0
54338       case 'z':
54339         if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
54340             ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
54341             ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
54342           return CW_SpecificReg;
54343         return CW_Invalid;
54344       // Conditional OpMask regs (AVX512)
54345       case 'k':
54346         if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
54347           return CW_Register;
54348         return CW_Invalid;
54349       // Any MMX reg
54350       case 'm':
54351         if (type->isX86_MMXTy() && Subtarget.hasMMX())
54352           return weight;
54353         return CW_Invalid;
54354       // Any SSE reg when ISA >= SSE2, same as 'x'
54355       case 'i':
54356       case 't':
54357       case '2':
54358         if (!Subtarget.hasSSE2())
54359           return CW_Invalid;
54360         break;
54361     }
54362     break;
54363   case 'v':
54364     if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
54365       weight = CW_Register;
54366     LLVM_FALLTHROUGH;
54367   case 'x':
54368     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
54369         ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
54370       weight = CW_Register;
54371     break;
54372   case 'k':
54373     // Enable conditional vector operations using %k<#> registers.
54374     if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
54375       weight = CW_Register;
54376     break;
54377   case 'I':
54378     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
54379       if (C->getZExtValue() <= 31)
54380         weight = CW_Constant;
54381     }
54382     break;
54383   case 'J':
54384     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54385       if (C->getZExtValue() <= 63)
54386         weight = CW_Constant;
54387     }
54388     break;
54389   case 'K':
54390     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54391       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
54392         weight = CW_Constant;
54393     }
54394     break;
54395   case 'L':
54396     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54397       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
54398         weight = CW_Constant;
54399     }
54400     break;
54401   case 'M':
54402     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54403       if (C->getZExtValue() <= 3)
54404         weight = CW_Constant;
54405     }
54406     break;
54407   case 'N':
54408     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54409       if (C->getZExtValue() <= 0xff)
54410         weight = CW_Constant;
54411     }
54412     break;
54413   case 'G':
54414   case 'C':
54415     if (isa<ConstantFP>(CallOperandVal)) {
54416       weight = CW_Constant;
54417     }
54418     break;
54419   case 'e':
54420     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54421       if ((C->getSExtValue() >= -0x80000000LL) &&
54422           (C->getSExtValue() <= 0x7fffffffLL))
54423         weight = CW_Constant;
54424     }
54425     break;
54426   case 'Z':
54427     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
54428       if (C->getZExtValue() <= 0xffffffff)
54429         weight = CW_Constant;
54430     }
54431     break;
54432   }
54433   return weight;
54434 }
54435 
54436 /// Try to replace an X constraint, which matches anything, with another that
54437 /// has more specific requirements based on the type of the corresponding
54438 /// operand.
54439 const char *X86TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
54441   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
54442   // 'f' like normal targets.
54443   if (ConstraintVT.isFloatingPoint()) {
54444     if (Subtarget.hasSSE1())
54445       return "x";
54446   }
54447 
54448   return TargetLowering::LowerXConstraint(ConstraintVT);
54449 }
54450 
54451 // Lower @cc targets via setcc.
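// For example, an "=@ccz" output is materialized by copying EFLAGS, emitting
// a SETcc for COND_E, and zero-extending the result to the output's integer
// type.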
54452 SDValue X86TargetLowering::LowerAsmOutputForConstraint(
54453     SDValue &Chain, SDValue &Flag, const SDLoc &DL,
54454     const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
54455   X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
54456   if (Cond == X86::COND_INVALID)
54457     return SDValue();
54458   // Check that return type is valid.
54459   if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
54460       OpInfo.ConstraintVT.getSizeInBits() < 8)
54461     report_fatal_error("Flag output operand is of invalid type");
54462 
54463   // Get EFLAGS register. Only update chain when copyfrom is glued.
54464   if (Flag.getNode()) {
54465     Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
54466     Chain = Flag.getValue(1);
54467   } else
54468     Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
54469   // Extract CC code.
54470   SDValue CC = getSETCC(Cond, Flag, DL, DAG);
54471   // Zero-extend to the asm output's integer type.
54472   SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
54473 
54474   return Result;
54475 }
54476 
54477 /// Lower the specified operand into the Ops vector.
54478 /// If it is invalid, don't add anything to Ops.
54479 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
54480                                                      std::string &Constraint,
54481                                                      std::vector<SDValue>&Ops,
54482                                                      SelectionDAG &DAG) const {
54483   SDValue Result;
54484 
54485   // Only support length 1 constraints for now.
54486   if (Constraint.length() > 1) return;
54487 
54488   char ConstraintLetter = Constraint[0];
54489   switch (ConstraintLetter) {
54490   default: break;
54491   case 'I':
54492     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
54493       if (C->getZExtValue() <= 31) {
54494         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
54495                                        Op.getValueType());
54496         break;
54497       }
54498     }
54499     return;
54500   case 'J':
54501     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
54502       if (C->getZExtValue() <= 63) {
54503         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
54504                                        Op.getValueType());
54505         break;
54506       }
54507     }
54508     return;
54509   case 'K':
54510     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
54511       if (isInt<8>(C->getSExtValue())) {
54512         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
54513                                        Op.getValueType());
54514         break;
54515       }
54516     }
54517     return;
54518   case 'L':
54519     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
54520       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
54521           (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
54522         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
54523                                        Op.getValueType());
54524         break;
54525       }
54526     }
54527     return;
54528   case 'M':
54529     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
54530       if (C->getZExtValue() <= 3) {
54531         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
54532                                        Op.getValueType());
54533         break;
54534       }
54535     }
54536     return;
54537   case 'N':
54538     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
54539       if (C->getZExtValue() <= 255) {
54540         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
54541                                        Op.getValueType());
54542         break;
54543       }
54544     }
54545     return;
54546   case 'O':
54547     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
54548       if (C->getZExtValue() <= 127) {
54549         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
54550                                        Op.getValueType());
54551         break;
54552       }
54553     }
54554     return;
54555   case 'e': {
54556     // 32-bit signed value
54557     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
54558       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
54559                                            C->getSExtValue())) {
54560         // Widen to 64 bits here to get it sign extended.
54561         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
54562         break;
54563       }
54564     // FIXME gcc accepts some relocatable values here too, but only in certain
54565     // memory models; it's complicated.
54566     }
54567     return;
54568   }
54569   case 'Z': {
54570     // 32-bit unsigned value
54571     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
54572       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
54573                                            C->getZExtValue())) {
54574         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
54575                                        Op.getValueType());
54576         break;
54577       }
54578     }
54579     // FIXME gcc accepts some relocatable values here too, but only in certain
54580     // memory models; it's complicated.
54581     return;
54582   }
54583   case 'i': {
54584     // Literal immediates are always ok.
54585     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
54586       bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
54587       BooleanContent BCont = getBooleanContents(MVT::i64);
54588       ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
54589                                     : ISD::SIGN_EXTEND;
54590       int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
54591                                                   : CST->getSExtValue();
54592       Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
54593       break;
54594     }
54595 
54596     // In any sort of PIC mode addresses need to be computed at runtime by
54597     // adding in a register or some sort of table lookup.  These can't
54598     // be used as immediates. BlockAddresses are fine though.
54599     if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
54600         !isa<BlockAddressSDNode>(Op))
54601       return;
54602 
54603     // If we are in non-pic codegen mode, we allow the address of a global (with
54604     // an optional displacement) to be used with 'i'.
54605     if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
54606       // If we require an extra load to get this address, as in PIC mode, we
54607       // can't accept it.
54608       if (isGlobalStubReference(
54609               Subtarget.classifyGlobalReference(GA->getGlobal())))
54610         return;
54611     break;
54612   }
54613   }
54614 
54615   if (Result.getNode()) {
54616     Ops.push_back(Result);
54617     return;
54618   }
54619   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
54620 }
54621 
54622 /// Check if \p RC is a general purpose register class.
54623 /// I.e., GR* or one of their variants.
54624 static bool isGRClass(const TargetRegisterClass &RC) {
54625   return RC.hasSuperClassEq(&X86::GR8RegClass) ||
54626          RC.hasSuperClassEq(&X86::GR16RegClass) ||
54627          RC.hasSuperClassEq(&X86::GR32RegClass) ||
54628          RC.hasSuperClassEq(&X86::GR64RegClass) ||
54629          RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
54630 }
54631 
54632 /// Check if \p RC is a vector register class.
54633 /// I.e., FR* / VR* or one of their variants.
54634 static bool isFRClass(const TargetRegisterClass &RC) {
54635   return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
54636          RC.hasSuperClassEq(&X86::FR32XRegClass) ||
54637          RC.hasSuperClassEq(&X86::FR64XRegClass) ||
54638          RC.hasSuperClassEq(&X86::VR128XRegClass) ||
54639          RC.hasSuperClassEq(&X86::VR256XRegClass) ||
54640          RC.hasSuperClassEq(&X86::VR512RegClass);
54641 }
54642 
54643 /// Check if \p RC is a mask register class.
54644 /// I.e., VK* or one of their variants.
54645 static bool isVKClass(const TargetRegisterClass &RC) {
54646   return RC.hasSuperClassEq(&X86::VK1RegClass) ||
54647          RC.hasSuperClassEq(&X86::VK2RegClass) ||
54648          RC.hasSuperClassEq(&X86::VK4RegClass) ||
54649          RC.hasSuperClassEq(&X86::VK8RegClass) ||
54650          RC.hasSuperClassEq(&X86::VK16RegClass) ||
54651          RC.hasSuperClassEq(&X86::VK32RegClass) ||
54652          RC.hasSuperClassEq(&X86::VK64RegClass);
54653 }
54654 
54655 std::pair<unsigned, const TargetRegisterClass *>
54656 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
54657                                                 StringRef Constraint,
54658                                                 MVT VT) const {
54659   // First, see if this is a constraint that directly corresponds to an LLVM
54660   // register class.
54661   if (Constraint.size() == 1) {
54662     // GCC Constraint Letters
54663     switch (Constraint[0]) {
54664     default: break;
54665     // 'A' means [ER]AX + [ER]DX.
54666     case 'A':
54667       if (Subtarget.is64Bit())
54668         return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
54669       assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
54670              "Expecting 64, 32 or 16 bit subtarget");
54671       return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
54672 
54673       // TODO: Slight differences here in allocation order and leaving
54674       // RIP in the class. Do they matter any more here than they do
54675       // in the normal allocation?
54676     case 'k':
54677       if (Subtarget.hasAVX512()) {
54678         if (VT == MVT::i1)
54679           return std::make_pair(0U, &X86::VK1RegClass);
54680         if (VT == MVT::i8)
54681           return std::make_pair(0U, &X86::VK8RegClass);
54682         if (VT == MVT::i16)
54683           return std::make_pair(0U, &X86::VK16RegClass);
54684       }
54685       if (Subtarget.hasBWI()) {
54686         if (VT == MVT::i32)
54687           return std::make_pair(0U, &X86::VK32RegClass);
54688         if (VT == MVT::i64)
54689           return std::make_pair(0U, &X86::VK64RegClass);
54690       }
54691       break;
54692     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
54693       if (Subtarget.is64Bit()) {
54694         if (VT == MVT::i8 || VT == MVT::i1)
54695           return std::make_pair(0U, &X86::GR8RegClass);
54696         if (VT == MVT::i16)
54697           return std::make_pair(0U, &X86::GR16RegClass);
54698         if (VT == MVT::i32 || VT == MVT::f32)
54699           return std::make_pair(0U, &X86::GR32RegClass);
54700         if (VT != MVT::f80 && !VT.isVector())
54701           return std::make_pair(0U, &X86::GR64RegClass);
54702         break;
54703       }
54704       LLVM_FALLTHROUGH;
54705       // 32-bit fallthrough
54706     case 'Q':   // Q_REGS
54707       if (VT == MVT::i8 || VT == MVT::i1)
54708         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
54709       if (VT == MVT::i16)
54710         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
54711       if (VT == MVT::i32 || VT == MVT::f32 ||
54712           (!VT.isVector() && !Subtarget.is64Bit()))
54713         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
54714       if (VT != MVT::f80 && !VT.isVector())
54715         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
54716       break;
54717     case 'r':   // GENERAL_REGS
54718     case 'l':   // INDEX_REGS
54719       if (VT == MVT::i8 || VT == MVT::i1)
54720         return std::make_pair(0U, &X86::GR8RegClass);
54721       if (VT == MVT::i16)
54722         return std::make_pair(0U, &X86::GR16RegClass);
54723       if (VT == MVT::i32 || VT == MVT::f32 ||
54724           (!VT.isVector() && !Subtarget.is64Bit()))
54725         return std::make_pair(0U, &X86::GR32RegClass);
54726       if (VT != MVT::f80 && !VT.isVector())
54727         return std::make_pair(0U, &X86::GR64RegClass);
54728       break;
54729     case 'R':   // LEGACY_REGS
54730       if (VT == MVT::i8 || VT == MVT::i1)
54731         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
54732       if (VT == MVT::i16)
54733         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
54734       if (VT == MVT::i32 || VT == MVT::f32 ||
54735           (!VT.isVector() && !Subtarget.is64Bit()))
54736         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
54737       if (VT != MVT::f80 && !VT.isVector())
54738         return std::make_pair(0U, &X86::GR64_NOREXRegClass);
54739       break;
54740     case 'f':  // FP Stack registers.
54741       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
54742       // value to the correct fpstack register class.
54743       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
54744         return std::make_pair(0U, &X86::RFP32RegClass);
54745       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
54746         return std::make_pair(0U, &X86::RFP64RegClass);
54747       if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
54748         return std::make_pair(0U, &X86::RFP80RegClass);
54749       break;
54750     case 'y':   // MMX_REGS if MMX allowed.
54751       if (!Subtarget.hasMMX()) break;
54752       return std::make_pair(0U, &X86::VR64RegClass);
54753     case 'v':
54754     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
54755       if (!Subtarget.hasSSE1()) break;
54756       bool VConstraint = (Constraint[0] == 'v');
54757 
54758       switch (VT.SimpleTy) {
54759       default: break;
54760       // Scalar SSE types.
54761       case MVT::f16:
54762         if (VConstraint && Subtarget.hasFP16())
54763           return std::make_pair(0U, &X86::FR16XRegClass);
54764         break;
54765       case MVT::f32:
54766       case MVT::i32:
54767         if (VConstraint && Subtarget.hasVLX())
54768           return std::make_pair(0U, &X86::FR32XRegClass);
54769         return std::make_pair(0U, &X86::FR32RegClass);
54770       case MVT::f64:
54771       case MVT::i64:
54772         if (VConstraint && Subtarget.hasVLX())
54773           return std::make_pair(0U, &X86::FR64XRegClass);
54774         return std::make_pair(0U, &X86::FR64RegClass);
54775       case MVT::i128:
54776         if (Subtarget.is64Bit()) {
54777           if (VConstraint && Subtarget.hasVLX())
54778             return std::make_pair(0U, &X86::VR128XRegClass);
54779           return std::make_pair(0U, &X86::VR128RegClass);
54780         }
54781         break;
54782       // Vector types and fp128.
54783       case MVT::v8f16:
54784         if (!Subtarget.hasFP16())
54785           break;
54786         LLVM_FALLTHROUGH;
54787       case MVT::f128:
54788       case MVT::v16i8:
54789       case MVT::v8i16:
54790       case MVT::v4i32:
54791       case MVT::v2i64:
54792       case MVT::v4f32:
54793       case MVT::v2f64:
54794         if (VConstraint && Subtarget.hasVLX())
54795           return std::make_pair(0U, &X86::VR128XRegClass);
54796         return std::make_pair(0U, &X86::VR128RegClass);
54797       // AVX types.
54798       case MVT::v16f16:
54799         if (!Subtarget.hasFP16())
54800           break;
54801         LLVM_FALLTHROUGH;
54802       case MVT::v32i8:
54803       case MVT::v16i16:
54804       case MVT::v8i32:
54805       case MVT::v4i64:
54806       case MVT::v8f32:
54807       case MVT::v4f64:
54808         if (VConstraint && Subtarget.hasVLX())
54809           return std::make_pair(0U, &X86::VR256XRegClass);
54810         if (Subtarget.hasAVX())
54811           return std::make_pair(0U, &X86::VR256RegClass);
54812         break;
54813       case MVT::v32f16:
54814         if (!Subtarget.hasFP16())
54815           break;
54816         LLVM_FALLTHROUGH;
54817       case MVT::v64i8:
54818       case MVT::v32i16:
54819       case MVT::v8f64:
54820       case MVT::v16f32:
54821       case MVT::v16i32:
54822       case MVT::v8i64:
54823         if (!Subtarget.hasAVX512()) break;
54824         if (VConstraint)
54825           return std::make_pair(0U, &X86::VR512RegClass);
54826         return std::make_pair(0U, &X86::VR512_0_15RegClass);
54827       }
54828       break;
54829     }
54830   } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
54831     switch (Constraint[1]) {
54832     default:
54833       break;
54834     case 'i':
54835     case 't':
54836     case '2':
54837       return getRegForInlineAsmConstraint(TRI, "x", VT);
54838     case 'm':
54839       if (!Subtarget.hasMMX()) break;
54840       return std::make_pair(0U, &X86::VR64RegClass);
54841     case 'z':
54842       if (!Subtarget.hasSSE1()) break;
54843       switch (VT.SimpleTy) {
54844       default: break;
54845       // Scalar SSE types.
54846       case MVT::f16:
54847         if (!Subtarget.hasFP16())
54848           break;
54849         return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
54850       case MVT::f32:
54851       case MVT::i32:
54852         return std::make_pair(X86::XMM0, &X86::FR32RegClass);
54853       case MVT::f64:
54854       case MVT::i64:
54855         return std::make_pair(X86::XMM0, &X86::FR64RegClass);
54856       case MVT::v8f16:
54857         if (!Subtarget.hasFP16())
54858           break;
54859         LLVM_FALLTHROUGH;
54860       case MVT::f128:
54861       case MVT::v16i8:
54862       case MVT::v8i16:
54863       case MVT::v4i32:
54864       case MVT::v2i64:
54865       case MVT::v4f32:
54866       case MVT::v2f64:
54867         return std::make_pair(X86::XMM0, &X86::VR128RegClass);
54868       // AVX types.
54869       case MVT::v16f16:
54870         if (!Subtarget.hasFP16())
54871           break;
54872         LLVM_FALLTHROUGH;
54873       case MVT::v32i8:
54874       case MVT::v16i16:
54875       case MVT::v8i32:
54876       case MVT::v4i64:
54877       case MVT::v8f32:
54878       case MVT::v4f64:
54879         if (Subtarget.hasAVX())
54880           return std::make_pair(X86::YMM0, &X86::VR256RegClass);
54881         break;
54882       case MVT::v32f16:
54883         if (!Subtarget.hasFP16())
54884           break;
54885         LLVM_FALLTHROUGH;
54886       case MVT::v64i8:
54887       case MVT::v32i16:
54888       case MVT::v8f64:
54889       case MVT::v16f32:
54890       case MVT::v16i32:
54891       case MVT::v8i64:
54892         if (Subtarget.hasAVX512())
54893           return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
54894         break;
54895       }
54896       break;
54897     case 'k':
54898       // This register class doesn't allocate k0 for masked vector operations.
54899       if (Subtarget.hasAVX512()) {
54900         if (VT == MVT::i1)
54901           return std::make_pair(0U, &X86::VK1WMRegClass);
54902         if (VT == MVT::i8)
54903           return std::make_pair(0U, &X86::VK8WMRegClass);
54904         if (VT == MVT::i16)
54905           return std::make_pair(0U, &X86::VK16WMRegClass);
54906       }
54907       if (Subtarget.hasBWI()) {
54908         if (VT == MVT::i32)
54909           return std::make_pair(0U, &X86::VK32WMRegClass);
54910         if (VT == MVT::i64)
54911           return std::make_pair(0U, &X86::VK64WMRegClass);
54912       }
54913       break;
54914     }
54915   }
54916 
54917   if (parseConstraintCode(Constraint) != X86::COND_INVALID)
54918     return std::make_pair(0U, &X86::GR32RegClass);
54919 
54920   // Use the default implementation in TargetLowering to convert the register
54921   // constraint into a member of a register class.
54922   std::pair<Register, const TargetRegisterClass*> Res;
54923   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
54924 
54925   // Not found as a standard register?
54926   if (!Res.second) {
54927     // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
54928     // to/from f80.
54929     if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
54930       // Map {st(0)} .. {st(7)} to FP0 .. FP7.
54931       if (Constraint.size() == 7 && Constraint[0] == '{' &&
54932           tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
54933           Constraint[3] == '(' &&
54934           (Constraint[4] >= '0' && Constraint[4] <= '7') &&
54935           Constraint[5] == ')' && Constraint[6] == '}') {
54936         // st(7) is not allocatable and thus not a member of RFP80. Return
54937         // singleton class in cases where we have a reference to it.
54938         if (Constraint[4] == '7')
54939           return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
54940         return std::make_pair(X86::FP0 + Constraint[4] - '0',
54941                               &X86::RFP80RegClass);
54942       }
54943 
54944       // GCC allows "st(0)" to be called just plain "st".
54945       if (StringRef("{st}").equals_insensitive(Constraint))
54946         return std::make_pair(X86::FP0, &X86::RFP80RegClass);
54947     }
54948 
54949     // flags -> EFLAGS
54950     if (StringRef("{flags}").equals_insensitive(Constraint))
54951       return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
54952 
54953     // dirflag -> DF
54954     // Only allow for clobber.
54955     if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
54956         VT == MVT::Other)
54957       return std::make_pair(X86::DF, &X86::DFCCRRegClass);
54958 
54959     // fpsr -> FPSW
54960     if (StringRef("{fpsr}").equals_insensitive(Constraint))
54961       return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
54962 
54963     return Res;
54964   }
54965 
54966   // Make sure it isn't a register that requires 64-bit mode.
54967   if (!Subtarget.is64Bit() &&
54968       (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
54969       TRI->getEncodingValue(Res.first) >= 8) {
54970     // Register requires REX prefix, but we're in 32-bit mode.
54971     return std::make_pair(0, nullptr);
54972   }
54973 
54974   // Make sure it isn't a register that requires AVX512.
54975   if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
54976       TRI->getEncodingValue(Res.first) & 0x10) {
54977     // Register requires EVEX prefix.
54978     return std::make_pair(0, nullptr);
54979   }
54980 
54981   // Otherwise, check to see if this is a register class of the wrong value
54982   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
54983   // turn into {ax},{dx}.
54984   // MVT::Other is used to specify clobber names.
54985   if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
54986     return Res;   // Correct type already, nothing to do.
54987 
54988   // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
54989   // return "eax". This should even work for things like getting 64-bit integer
54990   // registers when given an f64 type.
54991   const TargetRegisterClass *Class = Res.second;
54992   // The generic code will match the first register class that contains the
54993   // given register. Thus, based on the ordering of the tablegened file,
54994   // the "plain" GR classes might not come first.
54995   // Therefore, use a helper method.
54996   if (isGRClass(*Class)) {
54997     unsigned Size = VT.getSizeInBits();
54998     if (Size == 1) Size = 8;
54999     Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
55000     if (DestReg > 0) {
55001       bool is64Bit = Subtarget.is64Bit();
55002       const TargetRegisterClass *RC =
55003           Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
55004         : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
55005         : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
55006         : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
55007         : nullptr;
55008       if (Size == 64 && !is64Bit) {
55009         // Model GCC's behavior here and select a fixed pair of 32-bit
55010         // registers.
55011         switch (DestReg) {
55012         case X86::RAX:
55013           return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
55014         case X86::RDX:
55015           return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
55016         case X86::RCX:
55017           return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
55018         case X86::RBX:
55019           return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
55020         case X86::RSI:
55021           return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
55022         case X86::RDI:
55023           return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
55024         case X86::RBP:
55025           return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
55026         default:
55027           return std::make_pair(0, nullptr);
55028         }
55029       }
55030       if (RC && RC->contains(DestReg))
55031         return std::make_pair(DestReg, RC);
55032       return Res;
55033     }
55034     // No register found/type mismatch.
55035     return std::make_pair(0, nullptr);
55036   } else if (isFRClass(*Class)) {
55037     // Handle references to XMM physical registers that got mapped into the
55038     // wrong class.  This can happen with constraints like {xmm0} where the
55039     // target independent register mapper will just pick the first match it can
55040     // find, ignoring the required type.
55041 
55042     // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
55043     if (VT == MVT::f16)
55044       Res.second = &X86::FR16XRegClass;
55045     else if (VT == MVT::f32 || VT == MVT::i32)
55046       Res.second = &X86::FR32XRegClass;
55047     else if (VT == MVT::f64 || VT == MVT::i64)
55048       Res.second = &X86::FR64XRegClass;
55049     else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
55050       Res.second = &X86::VR128XRegClass;
55051     else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
55052       Res.second = &X86::VR256XRegClass;
55053     else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
55054       Res.second = &X86::VR512RegClass;
55055     else {
55056       // Type mismatch and not a clobber: return an error.
55057       Res.first = 0;
55058       Res.second = nullptr;
55059     }
55060   } else if (isVKClass(*Class)) {
55061     if (VT == MVT::i1)
55062       Res.second = &X86::VK1RegClass;
55063     else if (VT == MVT::i8)
55064       Res.second = &X86::VK8RegClass;
55065     else if (VT == MVT::i16)
55066       Res.second = &X86::VK16RegClass;
55067     else if (VT == MVT::i32)
55068       Res.second = &X86::VK32RegClass;
55069     else if (VT == MVT::i64)
55070       Res.second = &X86::VK64RegClass;
55071     else {
55072       // Type mismatch and not a clobber: return an error.
55073       Res.first = 0;
55074       Res.second = nullptr;
55075     }
55076   }
55077 
55078   return Res;
55079 }
55080 
55081 InstructionCost X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
55082                                                         const AddrMode &AM,
55083                                                         Type *Ty,
55084                                                         unsigned AS) const {
55085   // Scaling factors are not free at all.
55086   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
55087   // will take 2 allocations in the out-of-order engine instead of 1
55088   // for plain addressing mode, i.e. inst (reg1).
55089   // E.g.,
55090   // vaddps (%rsi,%rdx), %ymm0, %ymm1
55091   // Requires two allocations (one for the load, one for the computation)
55092   // whereas:
55093   // vaddps (%rsi), %ymm0, %ymm1
55094   // Requires just 1 allocation, i.e., freeing allocations for other operations
55095   // and having fewer micro-operations to execute.
55096   //
55097   // For some X86 architectures, this is even worse because for instance for
55098   // stores, the complex addressing mode forces the instruction to use the
55099   // "load" ports instead of the dedicated "store" port.
55100   // E.g., on Haswell:
55101   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
55102   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
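  // Return 0 when no scaled index register is used, 1 when one is, and a
  // negative cost when the addressing mode is not legal at all.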
55103   if (isLegalAddressingMode(DL, AM, Ty, AS))
55104     // Scale represents reg2 * scale, thus account for 1
55105     // as soon as we use a second register.
55106     return AM.Scale != 0;
55107   return -1;
55108 }
55109 
55110 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
55111   // Integer division on x86 is expensive. However, when aggressively optimizing
55112   // for code size, we prefer to use a div instruction, as it is usually smaller
55113   // than the alternative sequence.
55114   // The exception to this is vector division. Since x86 doesn't have vector
55115   // integer division, leaving the division as-is is a loss even in terms of
55116   // size, because it will have to be scalarized, while the alternative code
55117   // sequence can be performed in vector form.
55118   bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
55119   return OptSize && !VT.isVector();
55120 }
55121 
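// Split-CSR support (used for the CXX_FAST_TLS calling convention): instead of
// the usual prologue/epilogue spills, callee-saved registers are copied into
// virtual registers in the entry block and copied back before every return;
// see insertCopiesSplitCSR below.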
55122 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
55123   if (!Subtarget.is64Bit())
55124     return;
55125 
55126   // Update IsSplitCSR in X86MachineFunctionInfo.
55127   X86MachineFunctionInfo *AFI =
55128       Entry->getParent()->getInfo<X86MachineFunctionInfo>();
55129   AFI->setIsSplitCSR(true);
55130 }
55131 
55132 void X86TargetLowering::insertCopiesSplitCSR(
55133     MachineBasicBlock *Entry,
55134     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
55135   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
55136   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
55137   if (!IStart)
55138     return;
55139 
55140   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
55141   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
55142   MachineBasicBlock::iterator MBBI = Entry->begin();
55143   for (const MCPhysReg *I = IStart; *I; ++I) {
55144     const TargetRegisterClass *RC = nullptr;
55145     if (X86::GR64RegClass.contains(*I))
55146       RC = &X86::GR64RegClass;
55147     else
55148       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
55149 
55150     Register NewVR = MRI->createVirtualRegister(RC);
55151     // Create copy from CSR to a virtual register.
55152     // FIXME: this currently does not emit CFI pseudo-instructions, it works
55153     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
55154     // nounwind. If we want to generalize this later, we may need to emit
55155     // CFI pseudo-instructions.
55156     assert(
55157         Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
55158         "Function should be nounwind in insertCopiesSplitCSR!");
55159     Entry->addLiveIn(*I);
55160     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
55161         .addReg(*I);
55162 
55163     // Insert the copy-back instructions right before the terminator.
55164     for (auto *Exit : Exits)
55165       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
55166               TII->get(TargetOpcode::COPY), *I)
55167           .addReg(NewVR);
55168   }
55169 }
55170 
55171 bool X86TargetLowering::supportSwiftError() const {
55172   return Subtarget.is64Bit();
55173 }
55174 
55175 /// Returns true if stack probing through a function call is requested.
55176 bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
55177   return !getStackProbeSymbolName(MF).empty();
55178 }
55179 
55180 /// Returns true if stack probing through inline assembly is requested.
55181 bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
55182 
55183   // No inline stack probes on Windows; it has its own mechanism.
55184   if (Subtarget.isOSWindows() ||
55185       MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
55186     return false;
55187 
55188   // If the function specifically requests inline stack probes, emit them.
55189   if (MF.getFunction().hasFnAttribute("probe-stack"))
55190     return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
55191            "inline-asm";
55192 
55193   return false;
55194 }
55195 
55196 /// Returns the name of the symbol used to emit stack probes or the empty
55197 /// string if not applicable.
55198 StringRef
55199 X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
55200   // Inline stack probes disable the stack probe call.
55201   if (hasInlineStackProbe(MF))
55202     return "";
55203 
55204   // If the function specifically requests stack probes, emit them.
55205   if (MF.getFunction().hasFnAttribute("probe-stack"))
55206     return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
55207 
55208   // Generally, if we aren't on Windows, the platform ABI does not include
55209   // support for stack probes, so don't emit them.
55210   if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
55211       MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
55212     return "";
55213 
55214   // We need a stack probe to conform to the Windows ABI. Choose the right
55215   // symbol.
55216   if (Subtarget.is64Bit())
55217     return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
55218   return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
55219 }
55220 
55221 unsigned
55222 X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
55223   // The default stack probe size is 4096 if the function has no
55224   // "stack-probe-size" attribute.
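  // e.g. "stack-probe-size"="8192" widens the probing interval to 8 KiB.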
55225   unsigned StackProbeSize = 4096;
55226   const Function &Fn = MF.getFunction();
55227   if (Fn.hasFnAttribute("stack-probe-size"))
55228     Fn.getFnAttribute("stack-probe-size")
55229         .getValueAsString()
55230         .getAsInteger(0, StackProbeSize);
55231   return StackProbeSize;
55232 }
55233 
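// The experimental innermost-loop override below is interpreted as log2 of the
// alignment in bytes, so e.g. a value of 5 requests 32-byte alignment.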
55234 Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
55235   if (ML->isInnermost() &&
55236       ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
55237     return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
55238   return TargetLowering::getPrefLoopAlignment();
55239 }
55240