1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that X86 uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "X86ISelLowering.h"
15 #include "MCTargetDesc/X86ShuffleDecode.h"
16 #include "X86.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
22 #include "X86TargetMachine.h"
23 #include "X86TargetObjectFile.h"
24 #include "llvm/ADT/SmallBitVector.h"
25 #include "llvm/ADT/SmallSet.h"
26 #include "llvm/ADT/Statistic.h"
27 #include "llvm/ADT/StringExtras.h"
28 #include "llvm/ADT/StringSwitch.h"
29 #include "llvm/Analysis/BlockFrequencyInfo.h"
30 #include "llvm/Analysis/EHPersonalities.h"
31 #include "llvm/Analysis/ObjCARCUtil.h"
32 #include "llvm/Analysis/ProfileSummaryInfo.h"
33 #include "llvm/Analysis/VectorUtils.h"
34 #include "llvm/CodeGen/IntrinsicLowering.h"
35 #include "llvm/CodeGen/MachineFrameInfo.h"
36 #include "llvm/CodeGen/MachineFunction.h"
37 #include "llvm/CodeGen/MachineInstrBuilder.h"
38 #include "llvm/CodeGen/MachineJumpTableInfo.h"
39 #include "llvm/CodeGen/MachineLoopInfo.h"
40 #include "llvm/CodeGen/MachineModuleInfo.h"
41 #include "llvm/CodeGen/MachineRegisterInfo.h"
42 #include "llvm/CodeGen/TargetLowering.h"
43 #include "llvm/CodeGen/WinEHFuncInfo.h"
44 #include "llvm/IR/CallingConv.h"
45 #include "llvm/IR/Constants.h"
46 #include "llvm/IR/DerivedTypes.h"
47 #include "llvm/IR/DiagnosticInfo.h"
48 #include "llvm/IR/Function.h"
49 #include "llvm/IR/GlobalAlias.h"
50 #include "llvm/IR/GlobalVariable.h"
51 #include "llvm/IR/Instructions.h"
52 #include "llvm/IR/Intrinsics.h"
53 #include "llvm/IR/IRBuilder.h"
54 #include "llvm/MC/MCAsmInfo.h"
55 #include "llvm/MC/MCContext.h"
56 #include "llvm/MC/MCExpr.h"
57 #include "llvm/MC/MCSymbol.h"
58 #include "llvm/Support/CommandLine.h"
59 #include "llvm/Support/Debug.h"
60 #include "llvm/Support/ErrorHandling.h"
61 #include "llvm/Support/KnownBits.h"
62 #include "llvm/Support/MathExtras.h"
63 #include "llvm/Target/TargetOptions.h"
64 #include <algorithm>
65 #include <bitset>
66 #include <cctype>
67 #include <numeric>
68 using namespace llvm;
69 
70 #define DEBUG_TYPE "x86-isel"
71 
72 STATISTIC(NumTailCalls, "Number of tail calls");
73 
74 static cl::opt<int> ExperimentalPrefLoopAlignment(
75     "x86-experimental-pref-loop-alignment", cl::init(4),
76     cl::desc(
        "Sets the preferred loop alignment for experiments (as log2 bytes) "
78         "(the last x86-experimental-pref-loop-alignment bits"
79         " of the loop header PC will be 0)."),
80     cl::Hidden);
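// For example, the default value of 4 requests 2^4 = 16-byte alignment for
// loop headers; a value of 0 effectively disables the extra alignment.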
81 
82 static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
83     "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
84     cl::desc(
        "Sets the preferred loop alignment for experiments (as log2 bytes) "
86         "for innermost loops only. If specified, this option overrides "
87         "alignment set by x86-experimental-pref-loop-alignment."),
88     cl::Hidden);
89 
90 static cl::opt<bool> MulConstantOptimization(
91     "mul-constant-optimization", cl::init(true),
92     cl::desc("Replace 'mul x, Const' with more effective instructions like "
93              "SHIFT, LEA, etc."),
94     cl::Hidden);
95 
96 static cl::opt<bool> ExperimentalUnorderedISEL(
97     "x86-experimental-unordered-atomic-isel", cl::init(false),
98     cl::desc("Use LoadSDNode and StoreSDNode instead of "
99              "AtomicSDNode for unordered atomic loads and "
100              "stores respectively."),
101     cl::Hidden);
102 
103 /// Call this when the user attempts to do something unsupported, like
104 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
105 /// report_fatal_error, so calling code should attempt to recover without
106 /// crashing.
107 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
108                              const char *Msg) {
109   MachineFunction &MF = DAG.getMachineFunction();
110   DAG.getContext()->diagnose(
111       DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
112 }
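// For example, the lowering code below reports diagnostics such as
// errorUnsupported(DAG, dl, "SSE register return with SSE disabled"), which
// lets compilation continue with a best-effort lowering instead of aborting.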
113 
114 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
115                                      const X86Subtarget &STI)
116     : TargetLowering(TM), Subtarget(STI) {
117   bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
118   X86ScalarSSEf64 = Subtarget.hasSSE2();
119   X86ScalarSSEf32 = Subtarget.hasSSE1();
120   MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
121 
122   // Set up the TargetLowering object.
123 
124   // X86 is weird. It always uses i8 for shift amounts and setcc results.
125   setBooleanContents(ZeroOrOneBooleanContent);
126   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
127   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
128 
129   // For 64-bit, since we have so many registers, use the ILP scheduler.
130   // For 32-bit, use the register pressure specific scheduling.
131   // For Atom, always use ILP scheduling.
132   if (Subtarget.isAtom())
133     setSchedulingPreference(Sched::ILP);
134   else if (Subtarget.is64Bit())
135     setSchedulingPreference(Sched::ILP);
136   else
137     setSchedulingPreference(Sched::RegPressure);
138   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
139   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
140 
141   // Bypass expensive divides and use cheaper ones.
142   if (TM.getOptLevel() >= CodeGenOpt::Default) {
143     if (Subtarget.hasSlowDivide32())
144       addBypassSlowDiv(32, 8);
145     if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
146       addBypassSlowDiv(64, 32);
147   }
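  // For example, addBypassSlowDiv(32, 8) lets CodeGenPrepare emit a runtime
  // check and use an 8-bit DIV when both 32-bit operands happen to fit in
  // 8 bits, which is much cheaper on these subtargets.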
148 
  // Set up Windows compiler runtime calls.
150   if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
151     static const struct {
152       const RTLIB::Libcall Op;
153       const char * const Name;
154       const CallingConv::ID CC;
155     } LibraryCalls[] = {
156       { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
157       { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
158       { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
159       { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
160       { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
161     };
162 
163     for (const auto &LC : LibraryCalls) {
164       setLibcallName(LC.Op, LC.Name);
165       setLibcallCallingConv(LC.Op, LC.CC);
166     }
167   }
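  // With this mapping, a 64-bit signed divide on 32-bit MSVC targets becomes
  // a call to _alldiv using the stdcall convention, for example.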
168 
169   if (Subtarget.getTargetTriple().isOSMSVCRT()) {
170     // MSVCRT doesn't have powi; fall back to pow
171     setLibcallName(RTLIB::POWI_F32, nullptr);
172     setLibcallName(RTLIB::POWI_F64, nullptr);
173   }
174 
  // If we don't have cmpxchg8b (meaning this is a 386/486), limit the atomic
  // size to 32 bits so AtomicExpandPass will expand anything wider and we
  // don't need cmpxchg8b.
177   // FIXME: Should we be limiting the atomic size on other configs? Default is
178   // 1024.
179   if (!Subtarget.hasCmpxchg8b())
180     setMaxAtomicSizeInBitsSupported(32);
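  // With the 32-bit limit, a 64-bit atomic load on a 486 is typically
  // expanded by AtomicExpandPass into an __atomic_load_8 libcall.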
181 
182   // Set up the register classes.
183   addRegisterClass(MVT::i8, &X86::GR8RegClass);
184   addRegisterClass(MVT::i16, &X86::GR16RegClass);
185   addRegisterClass(MVT::i32, &X86::GR32RegClass);
186   if (Subtarget.is64Bit())
187     addRegisterClass(MVT::i64, &X86::GR64RegClass);
188 
189   for (MVT VT : MVT::integer_valuetypes())
190     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
191 
192   // We don't accept any truncstore of integer registers.
193   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
194   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
195   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
196   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
197   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
198   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
199 
200   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
201 
202   // SETOEQ and SETUNE require checking two conditions.
203   for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204     setCondCodeAction(ISD::SETOEQ, VT, Expand);
205     setCondCodeAction(ISD::SETUNE, VT, Expand);
206   }
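  // e.g. a scalar 'fcmp oeq' cannot be tested with a single flag check:
  // UCOMISS/UCOMISD report "unordered" via PF, so SETOEQ expands into two
  // checks (ZF set and PF clear).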
207 
208   // Integer absolute.
209   if (Subtarget.hasCMov()) {
210     setOperationAction(ISD::ABS            , MVT::i16  , Custom);
211     setOperationAction(ISD::ABS            , MVT::i32  , Custom);
212     if (Subtarget.is64Bit())
213       setOperationAction(ISD::ABS          , MVT::i64  , Custom);
214   }
215 
216   // Funnel shifts.
217   for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
218     // For slow shld targets we only lower for code size.
219     LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
220 
221     setOperationAction(ShiftOp             , MVT::i8   , Custom);
222     setOperationAction(ShiftOp             , MVT::i16  , Custom);
223     setOperationAction(ShiftOp             , MVT::i32  , ShiftDoubleAction);
224     if (Subtarget.is64Bit())
225       setOperationAction(ShiftOp           , MVT::i64  , ShiftDoubleAction);
226   }
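  // e.g. an i32 fshl with a variable amount can be selected to SHLD with the
  // amount in CL; on slow-SHLD subtargets we only form it when optimizing
  // for size.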
227 
228   if (!Subtarget.useSoftFloat()) {
229     // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
230     // operation.
231     setOperationAction(ISD::UINT_TO_FP,        MVT::i8, Promote);
232     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
233     setOperationAction(ISD::UINT_TO_FP,        MVT::i16, Promote);
234     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
235     // We have an algorithm for SSE2, and we turn this into a 64-bit
236     // FILD or VCVTUSI2SS/SD for other targets.
237     setOperationAction(ISD::UINT_TO_FP,        MVT::i32, Custom);
238     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
239     // We have an algorithm for SSE2->double, and we turn this into a
240     // 64-bit FILD followed by conditional FADD for other targets.
241     setOperationAction(ISD::UINT_TO_FP,        MVT::i64, Custom);
242     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
243 
244     // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
245     // this operation.
246     setOperationAction(ISD::SINT_TO_FP,        MVT::i8, Promote);
247     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
    // SSE has no i16 to fp conversion, only i32. We promote in the handler
    // to allow f80 to use i16, and f64 to use i16 with SSE1 only.
250     setOperationAction(ISD::SINT_TO_FP,        MVT::i16, Custom);
251     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
252     // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
253     setOperationAction(ISD::SINT_TO_FP,        MVT::i32, Custom);
254     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
255     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
256     // are Legal, f80 is custom lowered.
257     setOperationAction(ISD::SINT_TO_FP,        MVT::i64, Custom);
258     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
259 
    // Promote i8 FP_TO_SINT to a larger FP_TO_SINT, as X86 doesn't have
    // this operation.
262     setOperationAction(ISD::FP_TO_SINT,        MVT::i8,  Promote);
263     // FIXME: This doesn't generate invalid exception when it should. PR44019.
264     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8,  Promote);
265     setOperationAction(ISD::FP_TO_SINT,        MVT::i16, Custom);
266     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
267     setOperationAction(ISD::FP_TO_SINT,        MVT::i32, Custom);
268     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
269     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
270     // are Legal, f80 is custom lowered.
271     setOperationAction(ISD::FP_TO_SINT,        MVT::i64, Custom);
272     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
273 
274     // Handle FP_TO_UINT by promoting the destination to a larger signed
275     // conversion.
276     setOperationAction(ISD::FP_TO_UINT,        MVT::i8,  Promote);
277     // FIXME: This doesn't generate invalid exception when it should. PR44019.
278     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8,  Promote);
279     setOperationAction(ISD::FP_TO_UINT,        MVT::i16, Promote);
280     // FIXME: This doesn't generate invalid exception when it should. PR44019.
281     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
282     setOperationAction(ISD::FP_TO_UINT,        MVT::i32, Custom);
283     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
284     setOperationAction(ISD::FP_TO_UINT,        MVT::i64, Custom);
285     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
286 
287     setOperationAction(ISD::LRINT,             MVT::f32, Custom);
288     setOperationAction(ISD::LRINT,             MVT::f64, Custom);
289     setOperationAction(ISD::LLRINT,            MVT::f32, Custom);
290     setOperationAction(ISD::LLRINT,            MVT::f64, Custom);
291 
292     if (!Subtarget.is64Bit()) {
293       setOperationAction(ISD::LRINT,  MVT::i64, Custom);
294       setOperationAction(ISD::LLRINT, MVT::i64, Custom);
295     }
296   }
297 
298   if (Subtarget.hasSSE2()) {
299     // Custom lowering for saturating float to int conversions.
300     // We handle promotion to larger result types manually.
301     for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
302       setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
303       setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
304     }
305     if (Subtarget.is64Bit()) {
306       setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
307       setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
308     }
309   }
310 
  // Handle address space casts between mixed-sized pointers.
312   setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
313   setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
314 
315   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
316   if (!X86ScalarSSEf64) {
317     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
318     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
319     if (Subtarget.is64Bit()) {
320       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
321       // Without SSE, i64->f64 goes through memory.
322       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
323     }
324   } else if (!Subtarget.is64Bit())
325     setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
326 
327   // Scalar integer divide and remainder are lowered to use operations that
328   // produce two results, to match the available instructions. This exposes
329   // the two-result form to trivial CSE, which is able to combine x/y and x%y
330   // into a single instruction.
331   //
332   // Scalar integer multiply-high is also lowered to use two-result
333   // operations, to match the available instructions. However, plain multiply
334   // (low) operations are left as Legal, as there are single-result
335   // instructions for this in x86. Using the two-result multiply instructions
336   // when both high and low results are needed must be arranged by dagcombine.
337   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
338     setOperationAction(ISD::MULHS, VT, Expand);
339     setOperationAction(ISD::MULHU, VT, Expand);
340     setOperationAction(ISD::SDIV, VT, Expand);
341     setOperationAction(ISD::UDIV, VT, Expand);
342     setOperationAction(ISD::SREM, VT, Expand);
343     setOperationAction(ISD::UREM, VT, Expand);
344   }
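  // For example, a function computing both x / y and x % y on i32 ends up
  // with a single SDIVREM node after CSE, which selects to one IDIV
  // (quotient in EAX, remainder in EDX).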
345 
346   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
347   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
348   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
349                    MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
350     setOperationAction(ISD::BR_CC,     VT, Expand);
351     setOperationAction(ISD::SELECT_CC, VT, Expand);
352   }
353   if (Subtarget.is64Bit())
354     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
355   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
356   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
357   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
358 
359   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
360   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
361   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
362   setOperationAction(ISD::FREM             , MVT::f128 , Expand);
363 
364   if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
365     setOperationAction(ISD::FLT_ROUNDS_    , MVT::i32  , Custom);
366     setOperationAction(ISD::SET_ROUNDING   , MVT::Other, Custom);
367   }
368 
  // Promote the i8 variants and force them up to i32, which has a shorter
  // encoding.
371   setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
372   setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
373 
374   if (Subtarget.hasBMI()) {
    // Promote the i16 zero-undef variant and force it up to i32 when tzcnt
    // is enabled.
377     setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
378   } else {
379     setOperationAction(ISD::CTTZ, MVT::i16, Custom);
380     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
381     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Legal);
382     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
383     if (Subtarget.is64Bit()) {
384       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
385       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
386     }
387   }
388 
389   if (Subtarget.hasLZCNT()) {
390     // When promoting the i8 variants, force them to i32 for a shorter
391     // encoding.
392     setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
393     setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
394   } else {
395     for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
396       if (VT == MVT::i64 && !Subtarget.is64Bit())
397         continue;
398       setOperationAction(ISD::CTLZ           , VT, Custom);
399       setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
400     }
401   }
402 
403   for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
404                   ISD::STRICT_FP_TO_FP16}) {
405     // Special handling for half-precision floating point conversions.
406     // If we don't have F16C support, then lower half float conversions
407     // into library calls.
408     setOperationAction(
409         Op, MVT::f32,
410         (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
411     // There's never any support for operations beyond MVT::f32.
412     setOperationAction(Op, MVT::f64, Expand);
413     setOperationAction(Op, MVT::f80, Expand);
414     setOperationAction(Op, MVT::f128, Expand);
415   }
416 
417   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
418   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
419   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
420   setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
421   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
422   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
423   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
424   setTruncStoreAction(MVT::f128, MVT::f16, Expand);
425 
426   setOperationAction(ISD::PARITY, MVT::i8, Custom);
427   if (Subtarget.hasPOPCNT()) {
428     setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
429   } else {
430     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
431     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
432     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
433     if (Subtarget.is64Bit())
434       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
435     else
436       setOperationAction(ISD::CTPOP        , MVT::i64  , Custom);
437 
438     setOperationAction(ISD::PARITY, MVT::i16, Custom);
439     setOperationAction(ISD::PARITY, MVT::i32, Custom);
440     if (Subtarget.is64Bit())
441       setOperationAction(ISD::PARITY, MVT::i64, Custom);
442   }
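  // Even without POPCNT, ISD::PARITY stays cheap: the custom lowering
  // roughly reduces the value to 8 bits and reads the parity flag (PF)
  // produced by a TEST/XOR.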
443 
444   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
445 
446   if (!Subtarget.hasMOVBE())
447     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
448 
449   // X86 wants to expand cmov itself.
450   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
451     setOperationAction(ISD::SELECT, VT, Custom);
452     setOperationAction(ISD::SETCC, VT, Custom);
453     setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
454     setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
455   }
456   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
457     if (VT == MVT::i64 && !Subtarget.is64Bit())
458       continue;
459     setOperationAction(ISD::SELECT, VT, Custom);
460     setOperationAction(ISD::SETCC,  VT, Custom);
461   }
462 
463   // Custom action for SELECT MMX and expand action for SELECT_CC MMX
464   setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
465   setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
466 
467   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
468   // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
469   // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
470   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
471   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
472   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
473   if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
474     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
475 
476   // Darwin ABI issue.
477   for (auto VT : { MVT::i32, MVT::i64 }) {
478     if (VT == MVT::i64 && !Subtarget.is64Bit())
479       continue;
480     setOperationAction(ISD::ConstantPool    , VT, Custom);
481     setOperationAction(ISD::JumpTable       , VT, Custom);
482     setOperationAction(ISD::GlobalAddress   , VT, Custom);
483     setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
484     setOperationAction(ISD::ExternalSymbol  , VT, Custom);
485     setOperationAction(ISD::BlockAddress    , VT, Custom);
486   }
487 
488   // 64-bit shl, sra, srl (iff 32-bit x86)
489   for (auto VT : { MVT::i32, MVT::i64 }) {
490     if (VT == MVT::i64 && !Subtarget.is64Bit())
491       continue;
492     setOperationAction(ISD::SHL_PARTS, VT, Custom);
493     setOperationAction(ISD::SRA_PARTS, VT, Custom);
494     setOperationAction(ISD::SRL_PARTS, VT, Custom);
495   }
496 
497   if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
498     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
499 
500   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
501 
502   // Expand certain atomics
503   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
504     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
505     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
506     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
507     setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
508     setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
509     setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
510     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
511   }
512 
513   if (!Subtarget.is64Bit())
514     setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
515 
516   if (Subtarget.hasCmpxchg16b()) {
517     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
518   }
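  // i.e. a 128-bit cmpxchg can be selected to CMPXCHG16B (expected value in
  // RDX:RAX, replacement in RCX:RBX) instead of going through a libcall.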
519 
520   // FIXME - use subtarget debug flags
521   if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
522       !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
523       TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
524     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
525   }
526 
527   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
528   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
529 
530   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
531   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
532 
533   setOperationAction(ISD::TRAP, MVT::Other, Legal);
534   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
535   setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
536 
537   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
538   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
539   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
540   bool Is64Bit = Subtarget.is64Bit();
541   setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
542   setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
543 
544   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
545   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
546 
547   setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
548 
549   // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
550   setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
551   setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
552 
553   if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
554     // f32 and f64 use SSE.
555     // Set up the FP register classes.
556     addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
557                                                      : &X86::FR32RegClass);
558     addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
559                                                      : &X86::FR64RegClass);
560 
561     // Disable f32->f64 extload as we can only generate this in one instruction
    // under optsize. So it's easier to pattern match (fpext (load)) for that
563     // case instead of needing to emit 2 instructions for extload in the
564     // non-optsize case.
565     setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
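    // In the non-optsize case this typically means an f32->f64 extending
    // load is emitted as a MOVSS load followed by a reg-reg CVTSS2SD.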
566 
567     for (auto VT : { MVT::f32, MVT::f64 }) {
568       // Use ANDPD to simulate FABS.
569       setOperationAction(ISD::FABS, VT, Custom);
570 
571       // Use XORP to simulate FNEG.
572       setOperationAction(ISD::FNEG, VT, Custom);
573 
574       // Use ANDPD and ORPD to simulate FCOPYSIGN.
575       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
576 
577       // These might be better off as horizontal vector ops.
578       setOperationAction(ISD::FADD, VT, Custom);
579       setOperationAction(ISD::FSUB, VT, Custom);
580 
581       // We don't support sin/cos/fmod
582       setOperationAction(ISD::FSIN   , VT, Expand);
583       setOperationAction(ISD::FCOS   , VT, Expand);
584       setOperationAction(ISD::FSINCOS, VT, Expand);
585     }
586 
587     // Lower this to MOVMSK plus an AND.
588     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
589     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
590 
591   } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
592              (UseX87 || Is64Bit)) {
593     // Use SSE for f32, x87 for f64.
594     // Set up the FP register classes.
595     addRegisterClass(MVT::f32, &X86::FR32RegClass);
596     if (UseX87)
597       addRegisterClass(MVT::f64, &X86::RFP64RegClass);
598 
599     // Use ANDPS to simulate FABS.
600     setOperationAction(ISD::FABS , MVT::f32, Custom);
601 
602     // Use XORP to simulate FNEG.
603     setOperationAction(ISD::FNEG , MVT::f32, Custom);
604 
605     if (UseX87)
606       setOperationAction(ISD::UNDEF, MVT::f64, Expand);
607 
608     // Use ANDPS and ORPS to simulate FCOPYSIGN.
609     if (UseX87)
610       setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
611     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
612 
613     // We don't support sin/cos/fmod
614     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
615     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
616     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
617 
618     if (UseX87) {
619       // Always expand sin/cos functions even though x87 has an instruction.
620       setOperationAction(ISD::FSIN, MVT::f64, Expand);
621       setOperationAction(ISD::FCOS, MVT::f64, Expand);
622       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
623     }
624   } else if (UseX87) {
625     // f32 and f64 in x87.
626     // Set up the FP register classes.
627     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
628     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
629 
630     for (auto VT : { MVT::f32, MVT::f64 }) {
631       setOperationAction(ISD::UNDEF,     VT, Expand);
632       setOperationAction(ISD::FCOPYSIGN, VT, Expand);
633 
634       // Always expand sin/cos functions even though x87 has an instruction.
635       setOperationAction(ISD::FSIN   , VT, Expand);
636       setOperationAction(ISD::FCOS   , VT, Expand);
637       setOperationAction(ISD::FSINCOS, VT, Expand);
638     }
639   }
640 
  // Expand FP32 immediates into loads from the stack, except in special cases.
642   if (isTypeLegal(MVT::f32)) {
643     if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
644       addLegalFPImmediate(APFloat(+0.0f)); // FLD0
645       addLegalFPImmediate(APFloat(+1.0f)); // FLD1
646       addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
647       addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
648     } else // SSE immediates.
649       addLegalFPImmediate(APFloat(+0.0f)); // xorps
650   }
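  // e.g. with x87, materializing 'float x = 1.0f' uses FLD1 rather than a
  // constant-pool load; with SSE only +0.0f is special-cased (via XORPS).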
  // Expand FP64 immediates into loads from the stack, except in special cases.
652   if (isTypeLegal(MVT::f64)) {
653     if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
654       addLegalFPImmediate(APFloat(+0.0)); // FLD0
655       addLegalFPImmediate(APFloat(+1.0)); // FLD1
656       addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
657       addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
658     } else // SSE immediates.
659       addLegalFPImmediate(APFloat(+0.0)); // xorpd
660   }
  // Handle constrained floating-point operations for scalar types.
662   setOperationAction(ISD::STRICT_FADD,      MVT::f32, Legal);
663   setOperationAction(ISD::STRICT_FADD,      MVT::f64, Legal);
664   setOperationAction(ISD::STRICT_FSUB,      MVT::f32, Legal);
665   setOperationAction(ISD::STRICT_FSUB,      MVT::f64, Legal);
666   setOperationAction(ISD::STRICT_FMUL,      MVT::f32, Legal);
667   setOperationAction(ISD::STRICT_FMUL,      MVT::f64, Legal);
668   setOperationAction(ISD::STRICT_FDIV,      MVT::f32, Legal);
669   setOperationAction(ISD::STRICT_FDIV,      MVT::f64, Legal);
670   setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
671   setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f32, Legal);
672   setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f64, Legal);
673   setOperationAction(ISD::STRICT_FSQRT,     MVT::f32, Legal);
674   setOperationAction(ISD::STRICT_FSQRT,     MVT::f64, Legal);
675 
676   // We don't support FMA.
677   setOperationAction(ISD::FMA, MVT::f64, Expand);
678   setOperationAction(ISD::FMA, MVT::f32, Expand);
679 
680   // f80 always uses X87.
681   if (UseX87) {
682     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
683     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
684     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
685     {
686       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
687       addLegalFPImmediate(TmpFlt);  // FLD0
688       TmpFlt.changeSign();
689       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
690 
691       bool ignored;
692       APFloat TmpFlt2(+1.0);
693       TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
694                       &ignored);
695       addLegalFPImmediate(TmpFlt2);  // FLD1
696       TmpFlt2.changeSign();
697       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
698     }
699 
700     // Always expand sin/cos functions even though x87 has an instruction.
701     setOperationAction(ISD::FSIN   , MVT::f80, Expand);
702     setOperationAction(ISD::FCOS   , MVT::f80, Expand);
703     setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
704 
705     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
706     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
707     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
708     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
709     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
710     setOperationAction(ISD::FMA, MVT::f80, Expand);
711     setOperationAction(ISD::LROUND, MVT::f80, Expand);
712     setOperationAction(ISD::LLROUND, MVT::f80, Expand);
713     setOperationAction(ISD::LRINT, MVT::f80, Custom);
714     setOperationAction(ISD::LLRINT, MVT::f80, Custom);
715 
    // Handle constrained floating-point operations for scalar f80.
717     setOperationAction(ISD::STRICT_FADD     , MVT::f80, Legal);
718     setOperationAction(ISD::STRICT_FSUB     , MVT::f80, Legal);
719     setOperationAction(ISD::STRICT_FMUL     , MVT::f80, Legal);
720     setOperationAction(ISD::STRICT_FDIV     , MVT::f80, Legal);
721     setOperationAction(ISD::STRICT_FSQRT    , MVT::f80, Legal);
722     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
723     // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
724     // as Custom.
725     setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
726   }
727 
728   // f128 uses xmm registers, but most operations require libcalls.
729   if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
730     addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
731                                                    : &X86::VR128RegClass);
732 
733     addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
734 
735     setOperationAction(ISD::FADD,        MVT::f128, LibCall);
736     setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
737     setOperationAction(ISD::FSUB,        MVT::f128, LibCall);
738     setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
739     setOperationAction(ISD::FDIV,        MVT::f128, LibCall);
740     setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
741     setOperationAction(ISD::FMUL,        MVT::f128, LibCall);
742     setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
743     setOperationAction(ISD::FMA,         MVT::f128, LibCall);
744     setOperationAction(ISD::STRICT_FMA,  MVT::f128, LibCall);
745 
746     setOperationAction(ISD::FABS, MVT::f128, Custom);
747     setOperationAction(ISD::FNEG, MVT::f128, Custom);
748     setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
749 
750     setOperationAction(ISD::FSIN,         MVT::f128, LibCall);
751     setOperationAction(ISD::STRICT_FSIN,  MVT::f128, LibCall);
752     setOperationAction(ISD::FCOS,         MVT::f128, LibCall);
753     setOperationAction(ISD::STRICT_FCOS,  MVT::f128, LibCall);
754     setOperationAction(ISD::FSINCOS,      MVT::f128, LibCall);
755     // No STRICT_FSINCOS
756     setOperationAction(ISD::FSQRT,        MVT::f128, LibCall);
757     setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
758 
759     setOperationAction(ISD::FP_EXTEND,        MVT::f128, Custom);
760     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
761     // We need to custom handle any FP_ROUND with an f128 input, but
762     // LegalizeDAG uses the result type to know when to run a custom handler.
763     // So we have to list all legal floating point result types here.
764     if (isTypeLegal(MVT::f32)) {
765       setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
766       setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
767     }
768     if (isTypeLegal(MVT::f64)) {
769       setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
770       setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
771     }
772     if (isTypeLegal(MVT::f80)) {
773       setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
774       setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
775     }
776 
777     setOperationAction(ISD::SETCC, MVT::f128, Custom);
778 
779     setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
780     setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
781     setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
782     setTruncStoreAction(MVT::f128, MVT::f32, Expand);
783     setTruncStoreAction(MVT::f128, MVT::f64, Expand);
784     setTruncStoreAction(MVT::f128, MVT::f80, Expand);
785   }
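  // Net effect: an fp128 add becomes a libcall (e.g. __addtf3), while fabs
  // and fneg on fp128 are done inline by masking or flipping the sign bit in
  // an XMM register.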
786 
787   // Always use a library call for pow.
788   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
789   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
790   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
791   setOperationAction(ISD::FPOW             , MVT::f128 , Expand);
792 
793   setOperationAction(ISD::FLOG, MVT::f80, Expand);
794   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
795   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
796   setOperationAction(ISD::FEXP, MVT::f80, Expand);
797   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
798   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
799   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
800 
801   // Some FP actions are always expanded for vector types.
802   for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
803                    MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
804     setOperationAction(ISD::FSIN,      VT, Expand);
805     setOperationAction(ISD::FSINCOS,   VT, Expand);
806     setOperationAction(ISD::FCOS,      VT, Expand);
807     setOperationAction(ISD::FREM,      VT, Expand);
808     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
809     setOperationAction(ISD::FPOW,      VT, Expand);
810     setOperationAction(ISD::FLOG,      VT, Expand);
811     setOperationAction(ISD::FLOG2,     VT, Expand);
812     setOperationAction(ISD::FLOG10,    VT, Expand);
813     setOperationAction(ISD::FEXP,      VT, Expand);
814     setOperationAction(ISD::FEXP2,     VT, Expand);
815   }
816 
817   // First set operation action for all vector types to either promote
818   // (for widening) or expand (for scalarization). Then we will selectively
819   // turn on ones that can be effectively codegen'd.
820   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
821     setOperationAction(ISD::SDIV, VT, Expand);
822     setOperationAction(ISD::UDIV, VT, Expand);
823     setOperationAction(ISD::SREM, VT, Expand);
824     setOperationAction(ISD::UREM, VT, Expand);
825     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
826     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
827     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
828     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
829     setOperationAction(ISD::FMA,  VT, Expand);
830     setOperationAction(ISD::FFLOOR, VT, Expand);
831     setOperationAction(ISD::FCEIL, VT, Expand);
832     setOperationAction(ISD::FTRUNC, VT, Expand);
833     setOperationAction(ISD::FRINT, VT, Expand);
834     setOperationAction(ISD::FNEARBYINT, VT, Expand);
835     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
836     setOperationAction(ISD::MULHS, VT, Expand);
837     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
838     setOperationAction(ISD::MULHU, VT, Expand);
839     setOperationAction(ISD::SDIVREM, VT, Expand);
840     setOperationAction(ISD::UDIVREM, VT, Expand);
841     setOperationAction(ISD::CTPOP, VT, Expand);
842     setOperationAction(ISD::CTTZ, VT, Expand);
843     setOperationAction(ISD::CTLZ, VT, Expand);
844     setOperationAction(ISD::ROTL, VT, Expand);
845     setOperationAction(ISD::ROTR, VT, Expand);
846     setOperationAction(ISD::BSWAP, VT, Expand);
847     setOperationAction(ISD::SETCC, VT, Expand);
848     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
849     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
850     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
851     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
852     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
853     setOperationAction(ISD::TRUNCATE, VT, Expand);
854     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
855     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
856     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
857     setOperationAction(ISD::SELECT_CC, VT, Expand);
858     for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
859       setTruncStoreAction(InnerVT, VT, Expand);
860 
861       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
862       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
863 
      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types; we have to deal with them whether we ask for Expansion or not.
866       // Setting Expand causes its own optimisation problems though, so leave
867       // them legal.
868       if (VT.getVectorElementType() == MVT::i1)
869         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
870 
871       // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
872       // split/scalarized right now.
873       if (VT.getVectorElementType() == MVT::f16)
874         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
875     }
876   }
877 
878   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
879   // with -msoft-float, disable use of MMX as well.
880   if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
881     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx are supported; everything uses intrinsics.
883   }
884 
885   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
886     addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
887                                                     : &X86::VR128RegClass);
888 
889     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
890     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
891     setOperationAction(ISD::FCOPYSIGN,          MVT::v4f32, Custom);
892     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
893     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
894     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
895     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
896     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
897 
898     setOperationAction(ISD::LOAD,               MVT::v2f32, Custom);
899     setOperationAction(ISD::STORE,              MVT::v2f32, Custom);
900 
901     setOperationAction(ISD::STRICT_FADD,        MVT::v4f32, Legal);
902     setOperationAction(ISD::STRICT_FSUB,        MVT::v4f32, Legal);
903     setOperationAction(ISD::STRICT_FMUL,        MVT::v4f32, Legal);
904     setOperationAction(ISD::STRICT_FDIV,        MVT::v4f32, Legal);
905     setOperationAction(ISD::STRICT_FSQRT,       MVT::v4f32, Legal);
906   }
907 
908   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
909     addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
910                                                     : &X86::VR128RegClass);
911 
912     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
913     // registers cannot be used even for integer operations.
914     addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
915                                                     : &X86::VR128RegClass);
916     addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
917                                                     : &X86::VR128RegClass);
918     addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
919                                                     : &X86::VR128RegClass);
920     addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
921                                                     : &X86::VR128RegClass);
922 
923     for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
924                      MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
925       setOperationAction(ISD::SDIV, VT, Custom);
926       setOperationAction(ISD::SREM, VT, Custom);
927       setOperationAction(ISD::UDIV, VT, Custom);
928       setOperationAction(ISD::UREM, VT, Custom);
929     }
930 
931     setOperationAction(ISD::MUL,                MVT::v2i8,  Custom);
932     setOperationAction(ISD::MUL,                MVT::v4i8,  Custom);
933     setOperationAction(ISD::MUL,                MVT::v8i8,  Custom);
934 
935     setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
936     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
937     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
938     setOperationAction(ISD::MULHU,              MVT::v4i32, Custom);
939     setOperationAction(ISD::MULHS,              MVT::v4i32, Custom);
940     setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
941     setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
942     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
943     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
944     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
945 
946     setOperationAction(ISD::SMULO,              MVT::v16i8, Custom);
947     setOperationAction(ISD::UMULO,              MVT::v16i8, Custom);
948 
949     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
950     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
951     setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);
952 
953     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
954       setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
955       setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
956       setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
957       setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
958     }
959 
960     setOperationAction(ISD::UADDSAT,            MVT::v16i8, Legal);
961     setOperationAction(ISD::SADDSAT,            MVT::v16i8, Legal);
962     setOperationAction(ISD::USUBSAT,            MVT::v16i8, Legal);
963     setOperationAction(ISD::SSUBSAT,            MVT::v16i8, Legal);
964     setOperationAction(ISD::UADDSAT,            MVT::v8i16, Legal);
965     setOperationAction(ISD::SADDSAT,            MVT::v8i16, Legal);
966     setOperationAction(ISD::USUBSAT,            MVT::v8i16, Legal);
967     setOperationAction(ISD::SSUBSAT,            MVT::v8i16, Legal);
968     setOperationAction(ISD::USUBSAT,            MVT::v4i32, Custom);
969     setOperationAction(ISD::USUBSAT,            MVT::v2i64, Custom);
970 
971     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
972     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
973     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
974 
975     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
976       setOperationAction(ISD::SETCC,              VT, Custom);
977       setOperationAction(ISD::STRICT_FSETCC,      VT, Custom);
978       setOperationAction(ISD::STRICT_FSETCCS,     VT, Custom);
979       setOperationAction(ISD::CTPOP,              VT, Custom);
980       setOperationAction(ISD::ABS,                VT, Custom);
981 
      // The SETLT/SETLE condition codes aren't legal in SSE/AVX, and under
      // AVX512 we use setcc all the way to isel and prefer SETGT in some
      // isel patterns.
984       setCondCodeAction(ISD::SETLT, VT, Custom);
985       setCondCodeAction(ISD::SETLE, VT, Custom);
986     }
987 
988     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
989       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
990       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
991       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
992       setOperationAction(ISD::VSELECT,            VT, Custom);
993       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
994     }
995 
996     for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
997       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
998       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
999       setOperationAction(ISD::VSELECT,            VT, Custom);
1000 
1001       if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1002         continue;
1003 
1004       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1005       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1006     }
1007 
1008     // Custom lower v2i64 and v2f64 selects.
1009     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
1010     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
1011     setOperationAction(ISD::SELECT,             MVT::v4i32, Custom);
1012     setOperationAction(ISD::SELECT,             MVT::v8i16, Custom);
1013     setOperationAction(ISD::SELECT,             MVT::v16i8, Custom);
1014 
1015     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
1016     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Custom);
1017     setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
1018     setOperationAction(ISD::FP_TO_UINT,         MVT::v2i32, Custom);
1019     setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v4i32, Legal);
1020     setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v2i32, Custom);
1021 
1022     // Custom legalize these to avoid over promotion or custom promotion.
1023     for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1024       setOperationAction(ISD::FP_TO_SINT,        VT, Custom);
1025       setOperationAction(ISD::FP_TO_UINT,        VT, Custom);
1026       setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1027       setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1028     }
1029 
1030     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
1031     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v4i32, Legal);
1032     setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
1033     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v2i32, Custom);
1034 
1035     setOperationAction(ISD::UINT_TO_FP,         MVT::v2i32, Custom);
1036     setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v2i32, Custom);
1037 
1038     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
1039     setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v4i32, Custom);
1040 
1041     // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1042     setOperationAction(ISD::SINT_TO_FP,         MVT::v2f32, Custom);
1043     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v2f32, Custom);
1044     setOperationAction(ISD::UINT_TO_FP,         MVT::v2f32, Custom);
1045     setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v2f32, Custom);
1046 
1047     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
1048     setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v2f32, Custom);
1049     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
1050     setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v2f32, Custom);
1051 
1052     // We want to legalize this to an f64 load rather than an i64 load on
1053     // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1054     // store.
1055     setOperationAction(ISD::LOAD,               MVT::v2i32, Custom);
1056     setOperationAction(ISD::LOAD,               MVT::v4i16, Custom);
1057     setOperationAction(ISD::LOAD,               MVT::v8i8,  Custom);
1058     setOperationAction(ISD::STORE,              MVT::v2i32, Custom);
1059     setOperationAction(ISD::STORE,              MVT::v4i16, Custom);
1060     setOperationAction(ISD::STORE,              MVT::v8i8,  Custom);
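    // e.g. a <2 x i32> load can then be selected as a single 64-bit
    // MOVQ/MOVSD load instead of an i64 GPR load or, on 32-bit targets, two
    // separate i32 loads.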
1061 
1062     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
1063     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
1064     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
1065     if (!Subtarget.hasAVX512())
1066       setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1067 
1068     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1069     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1070     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1071 
1072     setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1073 
1074     setOperationAction(ISD::TRUNCATE,    MVT::v2i8,  Custom);
1075     setOperationAction(ISD::TRUNCATE,    MVT::v2i16, Custom);
1076     setOperationAction(ISD::TRUNCATE,    MVT::v2i32, Custom);
1077     setOperationAction(ISD::TRUNCATE,    MVT::v4i8,  Custom);
1078     setOperationAction(ISD::TRUNCATE,    MVT::v4i16, Custom);
1079     setOperationAction(ISD::TRUNCATE,    MVT::v8i8,  Custom);
1080 
    // In the custom shift lowering, the v4i32/v2i64 cases that are legal
    // with AVX2 will be recognized.
1083     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1084       setOperationAction(ISD::SRL,              VT, Custom);
1085       setOperationAction(ISD::SHL,              VT, Custom);
1086       setOperationAction(ISD::SRA,              VT, Custom);
1087     }
1088 
1089     setOperationAction(ISD::ROTL,               MVT::v4i32, Custom);
1090     setOperationAction(ISD::ROTL,               MVT::v8i16, Custom);
1091 
1092     // With 512-bit registers or AVX512VL+BW, expanding (and promoting the
1093     // shifts) is better.
1094     if (!Subtarget.useAVX512Regs() &&
1095         !(Subtarget.hasBWI() && Subtarget.hasVLX()))
1096       setOperationAction(ISD::ROTL,             MVT::v16i8, Custom);
1097 
1098     setOperationAction(ISD::STRICT_FSQRT,       MVT::v2f64, Legal);
1099     setOperationAction(ISD::STRICT_FADD,        MVT::v2f64, Legal);
1100     setOperationAction(ISD::STRICT_FSUB,        MVT::v2f64, Legal);
1101     setOperationAction(ISD::STRICT_FMUL,        MVT::v2f64, Legal);
1102     setOperationAction(ISD::STRICT_FDIV,        MVT::v2f64, Legal);
1103   }
1104 
1105   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1106     setOperationAction(ISD::ABS,                MVT::v16i8, Legal);
1107     setOperationAction(ISD::ABS,                MVT::v8i16, Legal);
1108     setOperationAction(ISD::ABS,                MVT::v4i32, Legal);
1109     setOperationAction(ISD::BITREVERSE,         MVT::v16i8, Custom);
1110     setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
1111     setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
1112     setOperationAction(ISD::CTLZ,               MVT::v4i32, Custom);
1113     setOperationAction(ISD::CTLZ,               MVT::v2i64, Custom);
1114 
1115     // These might be better off as horizontal vector ops.
1116     setOperationAction(ISD::ADD,                MVT::i16, Custom);
1117     setOperationAction(ISD::ADD,                MVT::i32, Custom);
1118     setOperationAction(ISD::SUB,                MVT::i16, Custom);
1119     setOperationAction(ISD::SUB,                MVT::i32, Custom);
1120   }
1121 
1122   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1123     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1124       setOperationAction(ISD::FFLOOR,            RoundedTy,  Legal);
1125       setOperationAction(ISD::STRICT_FFLOOR,     RoundedTy,  Legal);
1126       setOperationAction(ISD::FCEIL,             RoundedTy,  Legal);
1127       setOperationAction(ISD::STRICT_FCEIL,      RoundedTy,  Legal);
1128       setOperationAction(ISD::FTRUNC,            RoundedTy,  Legal);
1129       setOperationAction(ISD::STRICT_FTRUNC,     RoundedTy,  Legal);
1130       setOperationAction(ISD::FRINT,             RoundedTy,  Legal);
1131       setOperationAction(ISD::STRICT_FRINT,      RoundedTy,  Legal);
1132       setOperationAction(ISD::FNEARBYINT,        RoundedTy,  Legal);
1133       setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy,  Legal);
1134       setOperationAction(ISD::FROUNDEVEN,        RoundedTy,  Legal);
1135       setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy,  Legal);
1136 
1137       setOperationAction(ISD::FROUND,            RoundedTy,  Custom);
1138     }
1139 
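    // SSE4.1 fills in the packed min/max forms SSE2 lacks (SSE2 only provides
    // PMINSW/PMAXSW and PMINUB/PMAXUB).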
1140     setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
1141     setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
1142     setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
1143     setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
1144     setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
1145     setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
1146     setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
1147     setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
1148 
1149     setOperationAction(ISD::UADDSAT,            MVT::v4i32, Custom);
1150 
1151     // FIXME: Do we need to handle scalar-to-vector here?
1152     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1153 
1154     // We directly match byte blends in the backend as they match the VSELECT
1155     // condition form.
1156     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1157 
1158     // SSE41 brings specific instructions for doing vector sign extend even in
1159     // cases where we don't have SRA.
1160     for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1161       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1162       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1163     }
1164 
1165     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1166     for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1167       setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8,  Legal);
1168       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8,  Legal);
1169       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8,  Legal);
1170       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1171       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1172       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1173     }
1174 
1175     // i8 vectors are custom because the source register and source
1176     // memory operand types are not the same width.
1177     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1178 
1179     if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1180       // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1181       // do the pre and post work in the vector domain.
1182       setOperationAction(ISD::UINT_TO_FP,        MVT::v4i64, Custom);
1183       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1184       // We need to mark SINT_TO_FP as Custom even though we want to expand it
1185       // so that DAG combine doesn't try to turn it into uint_to_fp.
1186       setOperationAction(ISD::SINT_TO_FP,        MVT::v4i64, Custom);
1187       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1188     }
1189   }
1190 
1191   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1192     setOperationAction(ISD::UADDSAT,            MVT::v2i64, Custom);
1193   }
1194 
1195   if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1196     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
1197                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1198       setOperationAction(ISD::ROTL, VT, Custom);
1199 
1200     // XOP can efficiently perform BITREVERSE with VPPERM.
1201     for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1202       setOperationAction(ISD::BITREVERSE, VT, Custom);
1203 
1204     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
1205                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1206       setOperationAction(ISD::BITREVERSE, VT, Custom);
1207   }
1208 
1209   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1210     bool HasInt256 = Subtarget.hasInt256();
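    // HasInt256 (AVX2) gives native 256-bit integer ops; without it the integer
    // operations below are custom lowered, typically by splitting in half.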
1211 
1212     addRegisterClass(MVT::v32i8,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1213                                                      : &X86::VR256RegClass);
1214     addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1215                                                      : &X86::VR256RegClass);
1216     addRegisterClass(MVT::v8i32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1217                                                      : &X86::VR256RegClass);
1218     addRegisterClass(MVT::v8f32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1219                                                      : &X86::VR256RegClass);
1220     addRegisterClass(MVT::v4i64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1221                                                      : &X86::VR256RegClass);
1222     addRegisterClass(MVT::v4f64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1223                                                      : &X86::VR256RegClass);
1224 
1225     for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1226       setOperationAction(ISD::FFLOOR,            VT, Legal);
1227       setOperationAction(ISD::STRICT_FFLOOR,     VT, Legal);
1228       setOperationAction(ISD::FCEIL,             VT, Legal);
1229       setOperationAction(ISD::STRICT_FCEIL,      VT, Legal);
1230       setOperationAction(ISD::FTRUNC,            VT, Legal);
1231       setOperationAction(ISD::STRICT_FTRUNC,     VT, Legal);
1232       setOperationAction(ISD::FRINT,             VT, Legal);
1233       setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
1234       setOperationAction(ISD::FNEARBYINT,        VT, Legal);
1235       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1236       setOperationAction(ISD::FROUNDEVEN,        VT, Legal);
1237       setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1238 
1239       setOperationAction(ISD::FROUND,            VT, Custom);
1240 
1241       setOperationAction(ISD::FNEG,              VT, Custom);
1242       setOperationAction(ISD::FABS,              VT, Custom);
1243       setOperationAction(ISD::FCOPYSIGN,         VT, Custom);
1244     }
1245 
1246     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1247     // even though v8i16 is a legal type.
1248     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i16, MVT::v8i32);
1249     setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v8i16, MVT::v8i32);
1250     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1251     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1252     setOperationAction(ISD::FP_TO_SINT,                MVT::v8i32, Legal);
1253     setOperationAction(ISD::FP_TO_UINT,                MVT::v8i32, Custom);
1254     setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v8i32, Legal);
1255 
1256     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1257     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v8i32, Legal);
1258 
1259     setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v4f32, Legal);
1260     setOperationAction(ISD::STRICT_FADD,        MVT::v8f32, Legal);
1261     setOperationAction(ISD::STRICT_FADD,        MVT::v4f64, Legal);
1262     setOperationAction(ISD::STRICT_FSUB,        MVT::v8f32, Legal);
1263     setOperationAction(ISD::STRICT_FSUB,        MVT::v4f64, Legal);
1264     setOperationAction(ISD::STRICT_FMUL,        MVT::v8f32, Legal);
1265     setOperationAction(ISD::STRICT_FMUL,        MVT::v4f64, Legal);
1266     setOperationAction(ISD::STRICT_FDIV,        MVT::v8f32, Legal);
1267     setOperationAction(ISD::STRICT_FDIV,        MVT::v4f64, Legal);
1268     setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v4f64, Legal);
1269     setOperationAction(ISD::STRICT_FSQRT,       MVT::v8f32, Legal);
1270     setOperationAction(ISD::STRICT_FSQRT,       MVT::v4f64, Legal);
1271 
1272     if (!Subtarget.hasAVX512())
1273       setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1274 
1275     // In the custom shift lowering, the v8i32/v4i64 cases that are legal
1276     // with AVX2 will be recognized.
1277     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1278       setOperationAction(ISD::SRL, VT, Custom);
1279       setOperationAction(ISD::SHL, VT, Custom);
1280       setOperationAction(ISD::SRA, VT, Custom);
1281     }
1282 
1283     // These types need custom splitting if their input is a 128-bit vector.
1284     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i64,  Custom);
1285     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i32, Custom);
1286     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i64,  Custom);
1287     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i32, Custom);
1288 
1289     setOperationAction(ISD::ROTL,              MVT::v8i32,  Custom);
1290     setOperationAction(ISD::ROTL,              MVT::v16i16, Custom);
1291 
1292     // With BWI, expanding (and promoting the shifts) is better.
1293     if (!Subtarget.useBWIRegs())
1294       setOperationAction(ISD::ROTL,            MVT::v32i8,  Custom);
1295 
1296     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1297     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1298     setOperationAction(ISD::SELECT,            MVT::v8i32, Custom);
1299     setOperationAction(ISD::SELECT,            MVT::v16i16, Custom);
1300     setOperationAction(ISD::SELECT,            MVT::v32i8, Custom);
1301     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1302 
1303     for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1304       setOperationAction(ISD::SIGN_EXTEND,     VT, Custom);
1305       setOperationAction(ISD::ZERO_EXTEND,     VT, Custom);
1306       setOperationAction(ISD::ANY_EXTEND,      VT, Custom);
1307     }
1308 
1309     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1310     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1311     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1312     setOperationAction(ISD::BITREVERSE,        MVT::v32i8, Custom);
1313 
1314     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1315       setOperationAction(ISD::SETCC,           VT, Custom);
1316       setOperationAction(ISD::STRICT_FSETCC,   VT, Custom);
1317       setOperationAction(ISD::STRICT_FSETCCS,  VT, Custom);
1318       setOperationAction(ISD::CTPOP,           VT, Custom);
1319       setOperationAction(ISD::CTLZ,            VT, Custom);
1320 
1321       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1322       // setcc all the way to isel and prefer SETGT in some isel patterns.
1323       setCondCodeAction(ISD::SETLT, VT, Custom);
1324       setCondCodeAction(ISD::SETLE, VT, Custom);
1325     }
1326 
1327     if (Subtarget.hasAnyFMA()) {
1328       for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1329                        MVT::v2f64, MVT::v4f64 }) {
1330         setOperationAction(ISD::FMA, VT, Legal);
1331         setOperationAction(ISD::STRICT_FMA, VT, Legal);
1332       }
1333     }
1334 
1335     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1336       setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1337       setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1338     }
1339 
1340     setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
1341     setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
1342     setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
1343     setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
1344 
1345     setOperationAction(ISD::MULHU,     MVT::v8i32,  Custom);
1346     setOperationAction(ISD::MULHS,     MVT::v8i32,  Custom);
1347     setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
1348     setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
1349     setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
1350     setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
1351 
1352     setOperationAction(ISD::SMULO,     MVT::v32i8, Custom);
1353     setOperationAction(ISD::UMULO,     MVT::v32i8, Custom);
1354 
1355     setOperationAction(ISD::ABS,       MVT::v4i64,  Custom);
1356     setOperationAction(ISD::SMAX,      MVT::v4i64,  Custom);
1357     setOperationAction(ISD::UMAX,      MVT::v4i64,  Custom);
1358     setOperationAction(ISD::SMIN,      MVT::v4i64,  Custom);
1359     setOperationAction(ISD::UMIN,      MVT::v4i64,  Custom);
1360 
1361     setOperationAction(ISD::UADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1362     setOperationAction(ISD::SADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1363     setOperationAction(ISD::USUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1364     setOperationAction(ISD::SSUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1365     setOperationAction(ISD::UADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1366     setOperationAction(ISD::SADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1367     setOperationAction(ISD::USUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1368     setOperationAction(ISD::SSUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1369     setOperationAction(ISD::UADDSAT,   MVT::v8i32, Custom);
1370     setOperationAction(ISD::USUBSAT,   MVT::v8i32, Custom);
1371     setOperationAction(ISD::UADDSAT,   MVT::v4i64, Custom);
1372     setOperationAction(ISD::USUBSAT,   MVT::v4i64, Custom);
1373 
1374     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1375       setOperationAction(ISD::ABS,  VT, HasInt256 ? Legal : Custom);
1376       setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1377       setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1378       setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1379       setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1380     }
1381 
1382     for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1383       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1384       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1385     }
1386 
1387     if (HasInt256) {
1388       // The custom lowering of UINT_TO_FP for v8i32 becomes interesting
1389       // when we have a 256-bit-wide blend with immediate.
1390       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1391       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1392 
1393       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1394       for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1395         setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1396         setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i8,  Legal);
1397         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i8,  Legal);
1398         setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i16, Legal);
1399         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i16, Legal);
1400         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i32, Legal);
1401       }
1402     }
1403 
1404     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1405                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1406       setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
1407       setOperationAction(ISD::MSTORE, VT, Legal);
1408     }
1409 
1410     // Extract subvector is special because the value type
1411     // (result) is 128-bit but the source is 256-bit wide.
1412     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1413                      MVT::v4f32, MVT::v2f64 }) {
1414       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1415     }
1416 
1417     // Custom lower several nodes for 256-bit types.
1418     for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1419                     MVT::v8f32, MVT::v4f64 }) {
1420       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1421       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1422       setOperationAction(ISD::VSELECT,            VT, Custom);
1423       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1424       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1425       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1426       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
1427       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1428       setOperationAction(ISD::STORE,              VT, Custom);
1429     }
1430 
1431     if (HasInt256) {
1432       setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1433 
1434       // Custom legalize 2x32 to get a little better code.
1435       setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1436       setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1437 
1438       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1439                        MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1440         setOperationAction(ISD::MGATHER,  VT, Custom);
1441     }
1442   }
1443 
1444   // This block controls legalization of the mask vector sizes that are
1445   // available with AVX512. 512-bit vectors are in a separate block controlled
1446   // by useAVX512Regs.
1447   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
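    // vXi1 mask vectors live in the AVX512 k-registers (k0-k7).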
1448     addRegisterClass(MVT::v1i1,   &X86::VK1RegClass);
1449     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1450     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1451     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1452     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1453 
1454     setOperationAction(ISD::SELECT,             MVT::v1i1, Custom);
1455     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1456     setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i1, Custom);
1457 
1458     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i1,  MVT::v8i32);
1459     setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v8i1,  MVT::v8i32);
1460     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v4i1,  MVT::v4i32);
1461     setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v4i1,  MVT::v4i32);
1462     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1,  MVT::v8i32);
1463     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1,  MVT::v8i32);
1464     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1,  MVT::v4i32);
1465     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1,  MVT::v4i32);
1466     setOperationAction(ISD::FP_TO_SINT,                MVT::v2i1,  Custom);
1467     setOperationAction(ISD::FP_TO_UINT,                MVT::v2i1,  Custom);
1468     setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v2i1,  Custom);
1469     setOperationAction(ISD::STRICT_FP_TO_UINT,         MVT::v2i1,  Custom);
1470 
1471     // There is no byte sized k-register load or store without AVX512DQ.
1472     if (!Subtarget.hasDQI()) {
1473       setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1474       setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1475       setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1476       setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1477 
1478       setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1479       setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1480       setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1481       setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1482     }
1483 
1484     // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1485     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1486       setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1487       setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1488       setOperationAction(ISD::ANY_EXTEND,  VT, Custom);
1489     }
1490 
1491     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1492       setOperationAction(ISD::VSELECT,          VT, Expand);
1493 
1494     for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1495       setOperationAction(ISD::SETCC,            VT, Custom);
1496       setOperationAction(ISD::STRICT_FSETCC,    VT, Custom);
1497       setOperationAction(ISD::STRICT_FSETCCS,   VT, Custom);
1498       setOperationAction(ISD::SELECT,           VT, Custom);
1499       setOperationAction(ISD::TRUNCATE,         VT, Custom);
1500 
1501       setOperationAction(ISD::BUILD_VECTOR,     VT, Custom);
1502       setOperationAction(ISD::CONCAT_VECTORS,   VT, Custom);
1503       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1504       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1505       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1506       setOperationAction(ISD::VECTOR_SHUFFLE,   VT,  Custom);
1507     }
1508 
1509     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1510       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1511   }
1512 
1513   // This block controls legalization for 512-bit operations with 32/64 bit
1514   // elements. 512-bits can be disabled based on prefer-vector-width and
1515   // required-vector-width function attributes.
1516   if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1517     bool HasBWI = Subtarget.hasBWI();
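    // AVX512BW (BWI) is needed for native 512-bit byte/word (v64i8/v32i16)
    // operations; without it many of the actions below fall back to Custom.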
1518 
1519     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1520     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1521     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1522     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1523     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1524     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1525 
1526     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1527       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
1528       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1529       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
1530       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
1531       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
1532       if (HasBWI)
1533         setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1534     }
1535 
1536     for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1537       setOperationAction(ISD::FNEG,  VT, Custom);
1538       setOperationAction(ISD::FABS,  VT, Custom);
1539       setOperationAction(ISD::FMA,   VT, Legal);
1540       setOperationAction(ISD::STRICT_FMA, VT, Legal);
1541       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1542     }
1543 
1544     for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1545       setOperationPromotedToType(ISD::FP_TO_SINT       , VT, MVT::v16i32);
1546       setOperationPromotedToType(ISD::FP_TO_UINT       , VT, MVT::v16i32);
1547       setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1548       setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1549     }
1550     setOperationAction(ISD::FP_TO_SINT,        MVT::v16i32, Legal);
1551     setOperationAction(ISD::FP_TO_UINT,        MVT::v16i32, Legal);
1552     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
1553     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
1554     setOperationAction(ISD::SINT_TO_FP,        MVT::v16i32, Legal);
1555     setOperationAction(ISD::UINT_TO_FP,        MVT::v16i32, Legal);
1556     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
1557     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
1558 
1559     setOperationAction(ISD::STRICT_FADD,      MVT::v16f32, Legal);
1560     setOperationAction(ISD::STRICT_FADD,      MVT::v8f64,  Legal);
1561     setOperationAction(ISD::STRICT_FSUB,      MVT::v16f32, Legal);
1562     setOperationAction(ISD::STRICT_FSUB,      MVT::v8f64,  Legal);
1563     setOperationAction(ISD::STRICT_FMUL,      MVT::v16f32, Legal);
1564     setOperationAction(ISD::STRICT_FMUL,      MVT::v8f64,  Legal);
1565     setOperationAction(ISD::STRICT_FDIV,      MVT::v16f32, Legal);
1566     setOperationAction(ISD::STRICT_FDIV,      MVT::v8f64,  Legal);
1567     setOperationAction(ISD::STRICT_FSQRT,     MVT::v16f32, Legal);
1568     setOperationAction(ISD::STRICT_FSQRT,     MVT::v8f64,  Legal);
1569     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64,  Legal);
1570     setOperationAction(ISD::STRICT_FP_ROUND,  MVT::v8f32,  Legal);
1571 
1572     setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
1573     setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
1574     setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
1575     setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
1576     setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
1577     if (HasBWI)
1578       setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
1579 
1580     // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1581     // to 512-bit rather than use the AVX2 instructions so that we can use
1582     // k-masks.
1583     if (!Subtarget.hasVLX()) {
1584       for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1585            MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1586         setOperationAction(ISD::MLOAD,  VT, Custom);
1587         setOperationAction(ISD::MSTORE, VT, Custom);
1588       }
1589     }
1590 
1591     setOperationAction(ISD::TRUNCATE,    MVT::v8i32,  Legal);
1592     setOperationAction(ISD::TRUNCATE,    MVT::v16i16, Legal);
1593     setOperationAction(ISD::TRUNCATE,    MVT::v32i8,  HasBWI ? Legal : Custom);
1594     setOperationAction(ISD::TRUNCATE,    MVT::v16i64, Custom);
1595     setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1596     setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1597     setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
1598     setOperationAction(ISD::ANY_EXTEND,  MVT::v32i16, Custom);
1599     setOperationAction(ISD::ANY_EXTEND,  MVT::v16i32, Custom);
1600     setOperationAction(ISD::ANY_EXTEND,  MVT::v8i64,  Custom);
1601     setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1602     setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1603     setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
1604 
1605     if (HasBWI) {
1606       // Extends from v64i1 masks to 512-bit vectors.
1607       setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
1608       setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
1609       setOperationAction(ISD::ANY_EXTEND,         MVT::v64i8, Custom);
1610     }
1611 
1612     for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1613       setOperationAction(ISD::FFLOOR,            VT, Legal);
1614       setOperationAction(ISD::STRICT_FFLOOR,     VT, Legal);
1615       setOperationAction(ISD::FCEIL,             VT, Legal);
1616       setOperationAction(ISD::STRICT_FCEIL,      VT, Legal);
1617       setOperationAction(ISD::FTRUNC,            VT, Legal);
1618       setOperationAction(ISD::STRICT_FTRUNC,     VT, Legal);
1619       setOperationAction(ISD::FRINT,             VT, Legal);
1620       setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
1621       setOperationAction(ISD::FNEARBYINT,        VT, Legal);
1622       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1623       setOperationAction(ISD::FROUNDEVEN,        VT, Legal);
1624       setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1625 
1626       setOperationAction(ISD::FROUND,            VT, Custom);
1627     }
1628 
1629     for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1630       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1631       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1632     }
1633 
1634     setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1635     setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1636     setOperationAction(ISD::ADD, MVT::v64i8,  HasBWI ? Legal : Custom);
1637     setOperationAction(ISD::SUB, MVT::v64i8,  HasBWI ? Legal : Custom);
1638 
1639     setOperationAction(ISD::MUL, MVT::v8i64,  Custom);
1640     setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1641     setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1642     setOperationAction(ISD::MUL, MVT::v64i8,  Custom);
1643 
1644     setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1645     setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1646     setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1647     setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1648     setOperationAction(ISD::MULHS, MVT::v64i8,  Custom);
1649     setOperationAction(ISD::MULHU, MVT::v64i8,  Custom);
1650 
1651     setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1652     setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1653 
1654     setOperationAction(ISD::BITREVERSE, MVT::v64i8,  Custom);
1655 
1656     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1657       setOperationAction(ISD::SRL,              VT, Custom);
1658       setOperationAction(ISD::SHL,              VT, Custom);
1659       setOperationAction(ISD::SRA,              VT, Custom);
1660       setOperationAction(ISD::SETCC,            VT, Custom);
1661 
1662       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1663       // setcc all the way to isel and prefer SETGT in some isel patterns.
1664       setCondCodeAction(ISD::SETLT, VT, Custom);
1665       setCondCodeAction(ISD::SETLE, VT, Custom);
1666     }
1667     for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1668       setOperationAction(ISD::SMAX,             VT, Legal);
1669       setOperationAction(ISD::UMAX,             VT, Legal);
1670       setOperationAction(ISD::SMIN,             VT, Legal);
1671       setOperationAction(ISD::UMIN,             VT, Legal);
1672       setOperationAction(ISD::ABS,              VT, Legal);
1673       setOperationAction(ISD::CTPOP,            VT, Custom);
1674       setOperationAction(ISD::ROTL,             VT, Custom);
1675       setOperationAction(ISD::ROTR,             VT, Custom);
1676       setOperationAction(ISD::STRICT_FSETCC,    VT, Custom);
1677       setOperationAction(ISD::STRICT_FSETCCS,   VT, Custom);
1678     }
1679 
1680     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1681       setOperationAction(ISD::ABS,     VT, HasBWI ? Legal : Custom);
1682       setOperationAction(ISD::CTPOP,   VT, Subtarget.hasBITALG() ? Legal : Custom);
1683       setOperationAction(ISD::CTLZ,    VT, Custom);
1684       setOperationAction(ISD::SMAX,    VT, HasBWI ? Legal : Custom);
1685       setOperationAction(ISD::UMAX,    VT, HasBWI ? Legal : Custom);
1686       setOperationAction(ISD::SMIN,    VT, HasBWI ? Legal : Custom);
1687       setOperationAction(ISD::UMIN,    VT, HasBWI ? Legal : Custom);
1688       setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1689       setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1690       setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1691       setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1692     }
1693 
1694     if (Subtarget.hasDQI()) {
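      // AVX512DQ provides direct qword<->FP conversions (VCVT[U]QQ2PD/PS and
      // VCVTTP[SD]2[U]QQ) as well as VPMULLQ.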
1695       setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1696       setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1697       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
1698       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
1699       setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1700       setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1701       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
1702       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
1703 
1704       setOperationAction(ISD::MUL,        MVT::v8i64, Legal);
1705     }
1706 
1707     if (Subtarget.hasCDI()) {
1708       // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
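      // AVX512CD provides VPLZCNTD/VPLZCNTQ for these CTLZ cases.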
1709       for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1710         setOperationAction(ISD::CTLZ,            VT, Legal);
1711       }
1712     } // Subtarget.hasCDI()
1713 
1714     if (Subtarget.hasVPOPCNTDQ()) {
1715       for (auto VT : { MVT::v16i32, MVT::v8i64 })
1716         setOperationAction(ISD::CTPOP, VT, Legal);
1717     }
1718 
1719     // Extract subvector is special because the value type
1720     // (result) is 256-bit but the source is 512-bit wide.
1721     // 128-bit was made Legal under AVX1.
1722     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1723                      MVT::v8f32, MVT::v4f64 })
1724       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1725 
1726     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1727                      MVT::v16f32, MVT::v8f64 }) {
1728       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1729       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
1730       setOperationAction(ISD::SELECT,             VT, Custom);
1731       setOperationAction(ISD::VSELECT,            VT, Custom);
1732       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1733       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1734       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1735       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1736       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1737     }
1738 
1739     for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1740       setOperationAction(ISD::MLOAD,               VT, Legal);
1741       setOperationAction(ISD::MSTORE,              VT, Legal);
1742       setOperationAction(ISD::MGATHER,             VT, Custom);
1743       setOperationAction(ISD::MSCATTER,            VT, Custom);
1744     }
1745     if (HasBWI) {
1746       for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1747         setOperationAction(ISD::MLOAD,        VT, Legal);
1748         setOperationAction(ISD::MSTORE,       VT, Legal);
1749       }
1750     } else {
1751       setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1752       setOperationAction(ISD::STORE, MVT::v64i8,  Custom);
1753     }
1754 
1755     if (Subtarget.hasVBMI2()) {
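      // VBMI2 adds the VPSHLD/VPSHRD double-shift instructions (and their
      // variable forms) used to lower these funnel shifts.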
1756       for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1757                        MVT::v16i16, MVT::v8i32, MVT::v4i64,
1758                        MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1759         setOperationAction(ISD::FSHL, VT, Custom);
1760         setOperationAction(ISD::FSHR, VT, Custom);
1761       }
1762 
1763       setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1764       setOperationAction(ISD::ROTR, MVT::v8i16,  Custom);
1765       setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1766       setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1767     }
1768   } // useAVX512Regs
1769 
1770   // This block controls legalization for operations that don't have
1771   // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1772   // narrower widths.
1773   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1774     // These operations are handled on non-VLX by artificially widening in
1775     // isel patterns.
1776 
1777     setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
1778                        Subtarget.hasVLX() ? Legal : Custom);
1779     setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
1780                        Subtarget.hasVLX() ? Legal : Custom);
1781     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
1782                        Subtarget.hasVLX() ? Legal : Custom);
1783     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
1784                        Subtarget.hasVLX() ? Legal : Custom);
1785     setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v2i32, Custom);
1786     setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
1787                        Subtarget.hasVLX() ? Legal : Custom);
1788     setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
1789                        Subtarget.hasVLX() ? Legal : Custom);
1790     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
1791                        Subtarget.hasVLX() ? Legal : Custom);
1792     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
1793                        Subtarget.hasVLX() ? Legal : Custom);
1794 
1795     if (Subtarget.hasDQI()) {
1796       // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1797       // v2f32 UINT_TO_FP is already custom under SSE2.
1798       assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1799              isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1800              "Unexpected operation action!");
1801       // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1802       setOperationAction(ISD::FP_TO_SINT,        MVT::v2f32, Custom);
1803       setOperationAction(ISD::FP_TO_UINT,        MVT::v2f32, Custom);
1804       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1805       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1806     }
1807 
1808     for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1809       setOperationAction(ISD::SMAX, VT, Legal);
1810       setOperationAction(ISD::UMAX, VT, Legal);
1811       setOperationAction(ISD::SMIN, VT, Legal);
1812       setOperationAction(ISD::UMIN, VT, Legal);
1813       setOperationAction(ISD::ABS,  VT, Legal);
1814     }
1815 
1816     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1817       setOperationAction(ISD::ROTL,     VT, Custom);
1818       setOperationAction(ISD::ROTR,     VT, Custom);
1819     }
1820 
1821     // Custom legalize 2x32 to get a little better code.
1822     setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1823     setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1824 
1825     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1826                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1827       setOperationAction(ISD::MSCATTER, VT, Custom);
1828 
1829     if (Subtarget.hasDQI()) {
1830       for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1831         setOperationAction(ISD::SINT_TO_FP, VT,
1832                            Subtarget.hasVLX() ? Legal : Custom);
1833         setOperationAction(ISD::UINT_TO_FP, VT,
1834                            Subtarget.hasVLX() ? Legal : Custom);
1835         setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
1836                            Subtarget.hasVLX() ? Legal : Custom);
1837         setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
1838                            Subtarget.hasVLX() ? Legal : Custom);
1839         setOperationAction(ISD::FP_TO_SINT, VT,
1840                            Subtarget.hasVLX() ? Legal : Custom);
1841         setOperationAction(ISD::FP_TO_UINT, VT,
1842                            Subtarget.hasVLX() ? Legal : Custom);
1843         setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
1844                            Subtarget.hasVLX() ? Legal : Custom);
1845         setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
1846                            Subtarget.hasVLX() ? Legal : Custom);
1847         setOperationAction(ISD::MUL,               VT, Legal);
1848       }
1849     }
1850 
1851     if (Subtarget.hasCDI()) {
1852       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1853         setOperationAction(ISD::CTLZ,            VT, Legal);
1854       }
1855     } // Subtarget.hasCDI()
1856 
1857     if (Subtarget.hasVPOPCNTDQ()) {
1858       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1859         setOperationAction(ISD::CTPOP, VT, Legal);
1860     }
1861   }
1862 
1863   // This block controls legalization of v32i1/v64i1, which are available with
1864   // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1865   // useBWIRegs.
1866   if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1867     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1868     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1869 
1870     for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1871       setOperationAction(ISD::VSELECT,            VT, Expand);
1872       setOperationAction(ISD::TRUNCATE,           VT, Custom);
1873       setOperationAction(ISD::SETCC,              VT, Custom);
1874       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1875       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1876       setOperationAction(ISD::SELECT,             VT, Custom);
1877       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1878       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1879       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1880       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1881     }
1882 
1883     for (auto VT : { MVT::v16i1, MVT::v32i1 })
1884       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1885 
1886     // Extends from v32i1 masks to 256-bit vectors.
1887     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
1888     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
1889     setOperationAction(ISD::ANY_EXTEND,         MVT::v32i8, Custom);
1890 
1891     for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1892       setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
1893       setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1894     }
1895 
1896     // These operations are handled on non-VLX by artificially widening in
1897     // isel patterns.
1898     // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1899 
1900     if (Subtarget.hasBITALG()) {
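      // BITALG provides VPOPCNTB/VPOPCNTW for byte/word population counts.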
1901       for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1902         setOperationAction(ISD::CTPOP, VT, Legal);
1903     }
1904   }
1905 
1906   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1907     setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
1908     setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1909     setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1910     setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
1911     setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1912 
1913     setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
1914     setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1915     setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1916     setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
1917     setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1918 
1919     if (Subtarget.hasBWI()) {
1920       setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
1921       setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
1922     }
1923 
1924     setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
1925     setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
1926     setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1927   }
1928 
1929   if (Subtarget.hasAMXTILE()) {
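    // AMX tile data uses the opaque x86amx type, backed by the tmm0-tmm7 tile
    // registers.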
1930     addRegisterClass(MVT::x86amx, &X86::TILERegClass);
1931   }
1932 
1933   // We want to custom lower some of our intrinsics.
1934   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1935   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1936   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1937   if (!Subtarget.is64Bit()) {
1938     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1939   }
1940 
1941   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1942   // handle type legalization for these operations here.
1943   //
1944   // FIXME: We really should do custom legalization for addition and
1945   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1946   // than generic legalization for 64-bit multiplication-with-overflow, though.
1947   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1948     if (VT == MVT::i64 && !Subtarget.is64Bit())
1949       continue;
1950     // Add/Sub/Mul with overflow operations are custom lowered.
1951     setOperationAction(ISD::SADDO, VT, Custom);
1952     setOperationAction(ISD::UADDO, VT, Custom);
1953     setOperationAction(ISD::SSUBO, VT, Custom);
1954     setOperationAction(ISD::USUBO, VT, Custom);
1955     setOperationAction(ISD::SMULO, VT, Custom);
1956     setOperationAction(ISD::UMULO, VT, Custom);
1957 
1958     // Support carry in as value rather than glue.
1959     setOperationAction(ISD::ADDCARRY, VT, Custom);
1960     setOperationAction(ISD::SUBCARRY, VT, Custom);
1961     setOperationAction(ISD::SETCCCARRY, VT, Custom);
1962     setOperationAction(ISD::SADDO_CARRY, VT, Custom);
1963     setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
1964   }
1965 
1966   if (!Subtarget.is64Bit()) {
1967     // These libcalls are not available in 32-bit.
1968     setLibcallName(RTLIB::SHL_I128, nullptr);
1969     setLibcallName(RTLIB::SRL_I128, nullptr);
1970     setLibcallName(RTLIB::SRA_I128, nullptr);
1971     setLibcallName(RTLIB::MUL_I128, nullptr);
1972   }
1973 
1974   // Combine sin / cos into _sincos_stret if it is available.
1975   if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1976       getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1977     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1978     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1979   }
1980 
1981   if (Subtarget.isTargetWin64()) {
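    // The Win64 ABI passes 128-bit integer arguments by reference, so the i128
    // divide/remainder libcalls need custom call lowering.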
1982     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1983     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1984     setOperationAction(ISD::SREM, MVT::i128, Custom);
1985     setOperationAction(ISD::UREM, MVT::i128, Custom);
1986   }
1987 
1988   // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1989   // is. We should promote the value to 64-bits to solve this.
1990   // This is what the CRT headers do - `fmodf` is an inline header
1991   // function casting to f64 and calling `fmod`.
1992   if (Subtarget.is32Bit() &&
1993       (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
1994     for (ISD::NodeType Op :
1995          {ISD::FCEIL,  ISD::STRICT_FCEIL,
1996           ISD::FCOS,   ISD::STRICT_FCOS,
1997           ISD::FEXP,   ISD::STRICT_FEXP,
1998           ISD::FFLOOR, ISD::STRICT_FFLOOR,
1999           ISD::FREM,   ISD::STRICT_FREM,
2000           ISD::FLOG,   ISD::STRICT_FLOG,
2001           ISD::FLOG10, ISD::STRICT_FLOG10,
2002           ISD::FPOW,   ISD::STRICT_FPOW,
2003           ISD::FSIN,   ISD::STRICT_FSIN})
2004       if (isOperationExpand(Op, MVT::f32))
2005         setOperationAction(Op, MVT::f32, Promote);
2006 
2007   // We have target-specific dag combine patterns for the following nodes:
2008   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
2009   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
2010   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
2011   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
2012   setTargetDAGCombine(ISD::CONCAT_VECTORS);
2013   setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
2014   setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
2015   setTargetDAGCombine(ISD::BITCAST);
2016   setTargetDAGCombine(ISD::VSELECT);
2017   setTargetDAGCombine(ISD::SELECT);
2018   setTargetDAGCombine(ISD::SHL);
2019   setTargetDAGCombine(ISD::SRA);
2020   setTargetDAGCombine(ISD::SRL);
2021   setTargetDAGCombine(ISD::OR);
2022   setTargetDAGCombine(ISD::AND);
2023   setTargetDAGCombine(ISD::ADD);
2024   setTargetDAGCombine(ISD::FADD);
2025   setTargetDAGCombine(ISD::FSUB);
2026   setTargetDAGCombine(ISD::FNEG);
2027   setTargetDAGCombine(ISD::FMA);
2028   setTargetDAGCombine(ISD::STRICT_FMA);
2029   setTargetDAGCombine(ISD::FMINNUM);
2030   setTargetDAGCombine(ISD::FMAXNUM);
2031   setTargetDAGCombine(ISD::SUB);
2032   setTargetDAGCombine(ISD::LOAD);
2033   setTargetDAGCombine(ISD::MLOAD);
2034   setTargetDAGCombine(ISD::STORE);
2035   setTargetDAGCombine(ISD::MSTORE);
2036   setTargetDAGCombine(ISD::TRUNCATE);
2037   setTargetDAGCombine(ISD::ZERO_EXTEND);
2038   setTargetDAGCombine(ISD::ANY_EXTEND);
2039   setTargetDAGCombine(ISD::SIGN_EXTEND);
2040   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
2041   setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
2042   setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
2043   setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
2044   setTargetDAGCombine(ISD::SINT_TO_FP);
2045   setTargetDAGCombine(ISD::UINT_TO_FP);
2046   setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
2047   setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
2048   setTargetDAGCombine(ISD::SETCC);
2049   setTargetDAGCombine(ISD::MUL);
2050   setTargetDAGCombine(ISD::XOR);
2051   setTargetDAGCombine(ISD::MSCATTER);
2052   setTargetDAGCombine(ISD::MGATHER);
2053   setTargetDAGCombine(ISD::FP16_TO_FP);
2054   setTargetDAGCombine(ISD::FP_EXTEND);
2055   setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
2056   setTargetDAGCombine(ISD::FP_ROUND);
2057 
2058   computeRegisterProperties(Subtarget.getRegisterInfo());
2059 
2060   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2061   MaxStoresPerMemsetOptSize = 8;
2062   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2063   MaxStoresPerMemcpyOptSize = 4;
2064   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2065   MaxStoresPerMemmoveOptSize = 4;
2066 
2067   // TODO: These control memcmp expansion in CGP and could be raised higher, but
2068   // that needs to be benchmarked and balanced with the potential use of
2069   // vector load/store types (PR33329, PR33914).
2070   MaxLoadsPerMemcmp = 2;
2071   MaxLoadsPerMemcmpOptSize = 2;
2072 
2073   // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
2074   setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
2075 
2076   // An out-of-order CPU can speculatively execute past a predictable branch,
2077   // but a conditional move could be stalled by an expensive earlier operation.
2078   PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2079   EnableExtLdPromotion = true;
2080   setPrefFunctionAlignment(Align(16));
2081 
2082   verifyIntrinsicTables();
2083 
2084   // Default to having -disable-strictnode-mutation on
2085   IsStrictFPEnabled = true;
2086 }
2087 
2088 // This has so far only been implemented for 64-bit MachO.
2089 bool X86TargetLowering::useLoadStackGuardNode() const {
2090   return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2091 }
2092 
2093 bool X86TargetLowering::useStackGuardXorFP() const {
2094   // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2095   return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2096 }
2097 
2098 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2099                                                const SDLoc &DL) const {
2100   EVT PtrTy = getPointerTy(DAG.getDataLayout());
2101   unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2102   MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2103   return SDValue(Node, 0);
2104 }
2105 
2106 TargetLoweringBase::LegalizeTypeAction
2107 X86TargetLowering::getPreferredVectorAction(MVT VT) const {
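  // Without BWI there are no v32i1/v64i1 k-register classes (see above), so
  // split those mask types rather than widening them.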
2108   if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2109       !Subtarget.hasBWI())
2110     return TypeSplitVector;
2111 
2112   if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2113       VT.getVectorElementType() != MVT::i1)
2114     return TypeWidenVector;
2115 
2116   return TargetLoweringBase::getPreferredVectorAction(VT);
2117 }
2118 
2119 static std::pair<MVT, unsigned>
2120 handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2121                                  const X86Subtarget &Subtarget) {
2122   // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2123   // convention is one that uses k registers.
2124   if (NumElts == 2)
2125     return {MVT::v2i64, 1};
2126   if (NumElts == 4)
2127     return {MVT::v4i32, 1};
2128   if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2129       CC != CallingConv::Intel_OCL_BI)
2130     return {MVT::v8i16, 1};
2131   if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2132       CC != CallingConv::Intel_OCL_BI)
2133     return {MVT::v16i8, 1};
2134   // v32i1 passes in ymm unless we have BWI and the calling convention is
2135   // regcall.
2136   if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2137     return {MVT::v32i8, 1};
2138   // Split v64i1 vectors if we don't have v64i8 available.
2139   if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2140     if (Subtarget.useAVX512Regs())
2141       return {MVT::v64i8, 1};
2142     return {MVT::v32i8, 2};
2143   }
2144 
2145   // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2146   if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2147       NumElts > 64)
2148     return {MVT::i8, NumElts};
2149 
2150   return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2151 }
2152 
2153 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2154                                                      CallingConv::ID CC,
2155                                                      EVT VT) const {
2156   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2157       Subtarget.hasAVX512()) {
2158     unsigned NumElts = VT.getVectorNumElements();
2159 
2160     MVT RegisterVT;
2161     unsigned NumRegisters;
2162     std::tie(RegisterVT, NumRegisters) =
2163         handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2164     if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2165       return RegisterVT;
2166   }
2167 
2168   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2169 }
2170 
2171 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2172                                                           CallingConv::ID CC,
2173                                                           EVT VT) const {
2174   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2175       Subtarget.hasAVX512()) {
2176     unsigned NumElts = VT.getVectorNumElements();
2177 
2178     MVT RegisterVT;
2179     unsigned NumRegisters;
2180     std::tie(RegisterVT, NumRegisters) =
2181         handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2182     if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2183       return NumRegisters;
2184   }
2185 
2186   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2187 }
2188 
2189 unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2190     LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2191     unsigned &NumIntermediates, MVT &RegisterVT) const {
2192   // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2193   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2194       Subtarget.hasAVX512() &&
2195       (!isPowerOf2_32(VT.getVectorNumElements()) ||
2196        (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2197        VT.getVectorNumElements() > 64)) {
2198     RegisterVT = MVT::i8;
2199     IntermediateVT = MVT::i1;
2200     NumIntermediates = VT.getVectorNumElements();
2201     return NumIntermediates;
2202   }
2203 
2204   // Split v64i1 vectors if we don't have v64i8 available.
2205   if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2206       CC != CallingConv::X86_RegCall) {
2207     RegisterVT = MVT::v32i8;
2208     IntermediateVT = MVT::v32i1;
2209     NumIntermediates = 2;
2210     return 2;
2211   }
2212 
2213   return TargetLowering::getVectorTypeBreakdownForCallingConv(
2214       Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
2215 }
2216 
2217 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2218                                           LLVMContext& Context,
2219                                           EVT VT) const {
2220   if (!VT.isVector())
2221     return MVT::i8;
2222 
2223   if (Subtarget.hasAVX512()) {
2224     // Figure out what this type will be legalized to.
2225     EVT LegalVT = VT;
2226     while (getTypeAction(Context, LegalVT) != TypeLegal)
2227       LegalVT = getTypeToTransformTo(Context, LegalVT);
2228 
2229     // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2230     if (LegalVT.getSimpleVT().is512BitVector())
2231       return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2232 
2233     if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2234       // If we legalized to less than a 512-bit vector, then we will use a vXi1
2235       // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2236       // vXi16/vXi8.
2237       MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2238       if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2239         return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2240     }
2241   }
2242 
2243   return VT.changeVectorElementTypeToInteger();
2244 }
2245 
2246 /// Helper for getByValTypeAlignment to determine
2247 /// the desired ByVal argument alignment.
2248 static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
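  // 16 bytes is the largest byval alignment we ever use on x86, so stop the
  // recursion once we reach it.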
2249   if (MaxAlign == 16)
2250     return;
2251   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2252     if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
2253       MaxAlign = Align(16);
2254   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2255     Align EltAlign;
2256     getMaxByValAlign(ATy->getElementType(), EltAlign);
2257     if (EltAlign > MaxAlign)
2258       MaxAlign = EltAlign;
2259   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2260     for (auto *EltTy : STy->elements()) {
2261       Align EltAlign;
2262       getMaxByValAlign(EltTy, EltAlign);
2263       if (EltAlign > MaxAlign)
2264         MaxAlign = EltAlign;
2265       if (MaxAlign == 16)
2266         break;
2267     }
2268   }
2269 }
2270 
2271 /// Return the desired alignment for ByVal aggregate
2272 /// function arguments in the caller parameter area. For X86, aggregates
2273 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
2274 /// are at 4-byte boundaries.
2275 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
2276                                                   const DataLayout &DL) const {
2277   if (Subtarget.is64Bit()) {
2278     // Max of 8 and alignment of type.
2279     Align TyAlign = DL.getABITypeAlign(Ty);
2280     if (TyAlign > 8)
2281       return TyAlign.value();
2282     return 8;
2283   }
2284 
2285   Align Alignment(4);
2286   if (Subtarget.hasSSE1())
2287     getMaxByValAlign(Ty, Alignment);
2288   return Alignment.value();
2289 }
2290 
2291 /// It returns EVT::Other if the type should be determined using generic
2292 /// target-independent logic.
2293 /// For vector ops we check that the overall size isn't larger than our
2294 /// preferred vector width.
2295 EVT X86TargetLowering::getOptimalMemOpType(
2296     const MemOp &Op, const AttributeList &FuncAttributes) const {
2297   if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
2298     if (Op.size() >= 16 &&
2299         (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2300       // FIXME: Check if unaligned 64-byte accesses are slow.
2301       if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2302           (Subtarget.getPreferVectorWidth() >= 512)) {
2303         return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2304       }
2305       // FIXME: Check if unaligned 32-byte accesses are slow.
2306       if (Op.size() >= 32 && Subtarget.hasAVX() &&
2307           (Subtarget.getPreferVectorWidth() >= 256)) {
2308         // Although this isn't a well-supported type for AVX1, we'll let
2309         // legalization and shuffle lowering produce the optimal codegen. If we
2310         // choose an optimal type with a vector element larger than a byte,
2311         // getMemsetStores() may create an intermediate splat (using an integer
2312         // multiply) before we splat as a vector.
2313         return MVT::v32i8;
2314       }
2315       if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2316         return MVT::v16i8;
2317       // TODO: Can SSE1 handle a byte vector?
2318       // If we have SSE1 registers we should be able to use them.
2319       if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2320           (Subtarget.getPreferVectorWidth() >= 128))
2321         return MVT::v4f32;
2322     } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2323                Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2324       // Do not use f64 to lower memcpy if source is string constant. It's
2325       // better to use i32 to avoid the loads.
2326       // Also, do not use f64 to lower memset unless this is a memset of zeros.
2327       // The gymnastics of splatting a byte value into an XMM register and then
2328       // only using 8-byte stores (because this is a CPU with slow unaligned
2329       // 16-byte accesses) makes that a loser.
2330       return MVT::f64;
2331     }
2332   }
2333   // This is a compromise. If we reach here, unaligned accesses may be slow on
2334   // this target. However, creating smaller, aligned accesses could be even
2335   // slower and would certainly be a lot more code.
2336   if (Subtarget.is64Bit() && Op.size() >= 8)
2337     return MVT::i64;
2338   return MVT::i32;
2339 }
2340 
2341 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2342   if (VT == MVT::f32)
2343     return X86ScalarSSEf32;
2344   if (VT == MVT::f64)
2345     return X86ScalarSSEf64;
2346   return true;
2347 }
2348 
2349 bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2350     EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2351     bool *Fast) const {
2352   if (Fast) {
2353     switch (VT.getSizeInBits()) {
2354     default:
2355       // 8-byte and under are always assumed to be fast.
2356       *Fast = true;
2357       break;
2358     case 128:
2359       *Fast = !Subtarget.isUnalignedMem16Slow();
2360       break;
2361     case 256:
2362       *Fast = !Subtarget.isUnalignedMem32Slow();
2363       break;
2364     // TODO: What about AVX-512 (512-bit) accesses?
2365     }
2366   }
2367   // NonTemporal vector memory ops must be aligned.
2368   if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    // NT loads can only be vector aligned, so if it's less aligned than the
2370     // minimum vector size (which we can split the vector down to), we might as
2371     // well use a regular unaligned vector load.
2372     // We don't have any NT loads pre-SSE41.
2373     if (!!(Flags & MachineMemOperand::MOLoad))
2374       return (Alignment < 16 || !Subtarget.hasSSE41());
2375     return false;
2376   }
2377   // Misaligned accesses of any size are always allowed.
2378   return true;
2379 }
2380 
2381 /// Return the entry encoding for a jump table in the
2382 /// current function.  The returned value is a member of the
2383 /// MachineJumpTableInfo::JTEntryKind enum.
2384 unsigned X86TargetLowering::getJumpTableEncoding() const {
2385   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2386   // symbol.
2387   if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2388     return MachineJumpTableInfo::EK_Custom32;
2389 
2390   // Otherwise, use the normal jump table encoding heuristics.
2391   return TargetLowering::getJumpTableEncoding();
2392 }
2393 
2394 bool X86TargetLowering::useSoftFloat() const {
2395   return Subtarget.useSoftFloat();
2396 }
2397 
2398 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2399                                               ArgListTy &Args) const {
2400 
2401   // Only relabel X86-32 for C / Stdcall CCs.
2402   if (Subtarget.is64Bit())
2403     return;
2404   if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2405     return;
2406   unsigned ParamRegs = 0;
2407   if (auto *M = MF->getFunction().getParent())
2408     ParamRegs = M->getNumberRegisterParameters();
2409 
  // Mark the first N integer arguments as being passed in registers.
2411   for (auto &Arg : Args) {
2412     Type *T = Arg.Ty;
2413     if (T->isIntOrPtrTy())
2414       if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
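        // An argument wider than 4 bytes consumes two 32-bit registers.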
2415         unsigned numRegs = 1;
2416         if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2417           numRegs = 2;
2418         if (ParamRegs < numRegs)
2419           return;
2420         ParamRegs -= numRegs;
2421         Arg.IsInReg = true;
2422       }
2423   }
2424 }
2425 
2426 const MCExpr *
2427 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2428                                              const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
2430   assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2431   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2432   // entries.
2433   return MCSymbolRefExpr::create(MBB->getSymbol(),
2434                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
2435 }
2436 
2437 /// Returns relocation base for the given PIC jumptable.
2438 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2439                                                     SelectionDAG &DAG) const {
2440   if (!Subtarget.is64Bit())
2441     // This doesn't have SDLoc associated with it, but is not really the
2442     // same as a Register.
2443     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2444                        getPointerTy(DAG.getDataLayout()));
2445   return Table;
2446 }
2447 
2448 /// This returns the relocation base for the given PIC jumptable,
2449 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
2450 const MCExpr *X86TargetLowering::
2451 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2452                              MCContext &Ctx) const {
2453   // X86-64 uses RIP relative addressing based on the jump table label.
2454   if (Subtarget.isPICStyleRIPRel())
2455     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2456 
2457   // Otherwise, the reference is relative to the PIC base.
2458   return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2459 }
2460 
2461 std::pair<const TargetRegisterClass *, uint8_t>
2462 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2463                                            MVT VT) const {
2464   const TargetRegisterClass *RRC = nullptr;
2465   uint8_t Cost = 1;
2466   switch (VT.SimpleTy) {
2467   default:
2468     return TargetLowering::findRepresentativeClass(TRI, VT);
2469   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2470     RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2471     break;
2472   case MVT::x86mmx:
2473     RRC = &X86::VR64RegClass;
2474     break;
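  // All scalar FP and vector types are represented by the 128-bit XMM
  // register class for register-pressure purposes.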
2475   case MVT::f32: case MVT::f64:
2476   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2477   case MVT::v4f32: case MVT::v2f64:
2478   case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2479   case MVT::v8f32: case MVT::v4f64:
2480   case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2481   case MVT::v16f32: case MVT::v8f64:
2482     RRC = &X86::VR128XRegClass;
2483     break;
2484   }
2485   return std::make_pair(RRC, Cost);
2486 }
2487 
2488 unsigned X86TargetLowering::getAddressSpace() const {
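  // Address space 256 corresponds to %gs and 257 to %fs (see X86AS). The
  // Kernel code model and 32-bit targets use %gs; other 64-bit targets use
  // %fs.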
2489   if (Subtarget.is64Bit())
2490     return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2491   return 256;
2492 }
2493 
2494 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2495   return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2496          (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2497 }
2498 
2499 static Constant* SegmentOffset(IRBuilderBase &IRB,
2500                                int Offset, unsigned AddressSpace) {
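  // Build an inttoptr constant expression: a pointer in the given segment
  // address space whose value is the raw Offset, typed as a pointer to i8*.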
2501   return ConstantExpr::getIntToPtr(
2502       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2503       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2504 }
2505 
2506 Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
2507   // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2508   // tcbhead_t; use it instead of the usual global variable (see
2509   // sysdeps/{i386,x86_64}/nptl/tls.h)
2510   if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2511     if (Subtarget.isTargetFuchsia()) {
2512       // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2513       return SegmentOffset(IRB, 0x10, getAddressSpace());
2514     } else {
2515       unsigned AddressSpace = getAddressSpace();
2516       Module *M = IRB.GetInsertBlock()->getParent()->getParent();
      // In particular, users may customize the guard's base register and
      // offset.
      int Offset = M->getStackProtectorGuardOffset();
      // If no -stack-protector-guard-offset value was set, the default is
      // %fs:0x28, unless we're using a Kernel code model, in which case
      // it's %gs:0x28; %gs:0x14 on i386.
2522       if (Offset == INT_MAX)
2523         Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2524 
2525       StringRef GuardReg = M->getStackProtectorGuardReg();
2526       if (GuardReg == "fs")
2527         AddressSpace = X86AS::FS;
2528       else if (GuardReg == "gs")
2529         AddressSpace = X86AS::GS;
2530       return SegmentOffset(IRB, Offset, AddressSpace);
2531     }
2532   }
2533   return TargetLowering::getIRStackGuard(IRB);
2534 }
2535 
2536 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // The MSVC CRT provides functionality for stack protection.
2538   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2539       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2540     // MSVC CRT has a global variable holding security cookie.
2541     M.getOrInsertGlobal("__security_cookie",
2542                         Type::getInt8PtrTy(M.getContext()));
2543 
2544     // MSVC CRT has a function to validate security cookie.
2545     FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2546         "__security_check_cookie", Type::getVoidTy(M.getContext()),
2547         Type::getInt8PtrTy(M.getContext()));
2548     if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2549       F->setCallingConv(CallingConv::X86_FastCall);
2550       F->addAttribute(1, Attribute::AttrKind::InReg);
2551     }
2552     return;
2553   }
2554 
2555   StringRef GuardMode = M.getStackProtectorGuard();
2556 
2557   // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2558   if ((GuardMode == "tls" || GuardMode.empty()) &&
2559       hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2560     return;
2561   TargetLowering::insertSSPDeclarations(M);
2562 }
2563 
2564 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2565   // MSVC CRT has a global variable holding security cookie.
2566   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2567       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2568     return M.getGlobalVariable("__security_cookie");
2569   }
2570   return TargetLowering::getSDagStackGuard(M);
2571 }
2572 
2573 Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2574   // MSVC CRT has a function to validate security cookie.
2575   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2576       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2577     return M.getFunction("__security_check_cookie");
2578   }
2579   return TargetLowering::getSSPStackGuardCheck(M);
2580 }
2581 
2582 Value *
2583 X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
2584   if (Subtarget.getTargetTriple().isOSContiki())
2585     return getDefaultSafeStackPointerLocation(IRB, false);
2586 
2587   // Android provides a fixed TLS slot for the SafeStack pointer. See the
2588   // definition of TLS_SLOT_SAFESTACK in
2589   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2590   if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case it's
    // %gs:0x48; %gs:0x24 on i386.
2593     int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2594     return SegmentOffset(IRB, Offset, getAddressSpace());
2595   }
2596 
2597   // Fuchsia is similar.
2598   if (Subtarget.isTargetFuchsia()) {
2599     // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2600     return SegmentOffset(IRB, 0x18, getAddressSpace());
2601   }
2602 
2603   return TargetLowering::getSafeStackPointerLocation(IRB);
2604 }
2605 
2606 //===----------------------------------------------------------------------===//
2607 //               Return Value Calling Convention Implementation
2608 //===----------------------------------------------------------------------===//
2609 
2610 bool X86TargetLowering::CanLowerReturn(
2611     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2612     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2613   SmallVector<CCValAssign, 16> RVLocs;
2614   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2615   return CCInfo.CheckReturn(Outs, RetCC_X86);
2616 }
2617 
2618 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2619   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2620   return ScratchRegs;
2621 }
2622 
/// Lowers mask values (v*i1) to the local register values.
2624 /// \returns DAG node after lowering to register type
2625 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2626                                const SDLoc &Dl, SelectionDAG &DAG) {
2627   EVT ValVT = ValArg.getValueType();
2628 
2629   if (ValVT == MVT::v1i1)
2630     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2631                        DAG.getIntPtrConstant(0, Dl));
2632 
2633   if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2634       (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2635     // Two stage lowering might be required
2636     // bitcast:   v8i1 -> i8 / v16i1 -> i16
2637     // anyextend: i8   -> i32 / i16   -> i32
2638     EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2639     SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2640     if (ValLoc == MVT::i32)
2641       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2642     return ValToCopy;
2643   }
2644 
2645   if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2646       (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2647     // One stage lowering is required
2648     // bitcast:   v32i1 -> i32 / v64i1 -> i64
2649     return DAG.getBitcast(ValLoc, ValArg);
2650   }
2651 
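  // For all remaining mask types, any-extend the value to the location type.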
2652   return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2653 }
2654 
/// Breaks a v64i1 value into two registers and adds the new nodes to the DAG.
2656 static void Passv64i1ArgInRegs(
2657     const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2658     SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
2659     CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2660   assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2661   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2662   assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2663   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2664          "The value should reside in two registers");
2665 
2666   // Before splitting the value we cast it to i64
2667   Arg = DAG.getBitcast(MVT::i64, Arg);
2668 
  // Split the value into two i32 halves.
2670   SDValue Lo, Hi;
2671   Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2672                    DAG.getConstant(0, Dl, MVT::i32));
2673   Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2674                    DAG.getConstant(1, Dl, MVT::i32));
2675 
  // Pass the two i32 halves in their corresponding registers.
2677   RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2678   RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2679 }
2680 
2681 SDValue
2682 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2683                                bool isVarArg,
2684                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2685                                const SmallVectorImpl<SDValue> &OutVals,
2686                                const SDLoc &dl, SelectionDAG &DAG) const {
2687   MachineFunction &MF = DAG.getMachineFunction();
2688   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2689 
2690   // In some cases we need to disable registers from the default CSR list.
2691   // For example, when they are used for argument passing.
2692   bool ShouldDisableCalleeSavedRegister =
2693       CallConv == CallingConv::X86_RegCall ||
2694       MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2695 
2696   if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2697     report_fatal_error("X86 interrupts may not return any value");
2698 
2699   SmallVector<CCValAssign, 16> RVLocs;
2700   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2701   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2702 
2703   SmallVector<std::pair<Register, SDValue>, 4> RetVals;
2704   for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2705        ++I, ++OutsIndex) {
2706     CCValAssign &VA = RVLocs[I];
2707     assert(VA.isRegLoc() && "Can only return in registers!");
2708 
2709     // Add the register to the CalleeSaveDisableRegs list.
2710     if (ShouldDisableCalleeSavedRegister)
2711       MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2712 
2713     SDValue ValToCopy = OutVals[OutsIndex];
2714     EVT ValVT = ValToCopy.getValueType();
2715 
2716     // Promote values to the appropriate types.
2717     if (VA.getLocInfo() == CCValAssign::SExt)
2718       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2719     else if (VA.getLocInfo() == CCValAssign::ZExt)
2720       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2721     else if (VA.getLocInfo() == CCValAssign::AExt) {
2722       if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2723         ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2724       else
2725         ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2726     }
2727     else if (VA.getLocInfo() == CCValAssign::BCvt)
2728       ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2729 
2730     assert(VA.getLocInfo() != CCValAssign::FPExt &&
2731            "Unexpected FP-extend for return value.");
2732 
2733     // Report an error if we have attempted to return a value via an XMM
2734     // register and SSE was disabled.
2735     if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
2736       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2737       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2738     } else if (!Subtarget.hasSSE2() &&
2739                X86::FR64XRegClass.contains(VA.getLocReg()) &&
2740                ValVT == MVT::f64) {
2741       // When returning a double via an XMM register, report an error if SSE2 is
2742       // not enabled.
2743       errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2744       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2745     }
2746 
2747     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2748     // the RET instruction and handled by the FP Stackifier.
2749     if (VA.getLocReg() == X86::FP0 ||
2750         VA.getLocReg() == X86::FP1) {
2751       // If this is a copy from an xmm register to ST(0), use an FPExtend to
2752       // change the value to the FP stack register class.
2753       if (isScalarFPTypeInSSEReg(VA.getValVT()))
2754         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2755       RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2756       // Don't emit a copytoreg.
2757       continue;
2758     }
2759 
2760     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2761     // which is returned in RAX / RDX.
2762     if (Subtarget.is64Bit()) {
2763       if (ValVT == MVT::x86mmx) {
2764         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2765           ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2766           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2767                                   ValToCopy);
2768           // If we don't have SSE2 available, convert to v4f32 so the generated
2769           // register is legal.
2770           if (!Subtarget.hasSSE2())
2771             ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2772         }
2773       }
2774     }
2775 
2776     if (VA.needsCustom()) {
2777       assert(VA.getValVT() == MVT::v64i1 &&
2778              "Currently the only custom case is when we split v64i1 to 2 regs");
2779 
2780       Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
2781                          Subtarget);
2782 
2783       // Add the second register to the CalleeSaveDisableRegs list.
2784       if (ShouldDisableCalleeSavedRegister)
2785         MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2786     } else {
2787       RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2788     }
2789   }
2790 
2791   SDValue Flag;
2792   SmallVector<SDValue, 6> RetOps;
2793   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2794   // Operand #1 = Bytes To Pop
2795   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2796                    MVT::i32));
2797 
2798   // Copy the result values into the output registers.
2799   for (auto &RetVal : RetVals) {
2800     if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
2801       RetOps.push_back(RetVal.second);
2802       continue; // Don't emit a copytoreg.
2803     }
2804 
2805     Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
2806     Flag = Chain.getValue(1);
2807     RetOps.push_back(
2808         DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
2809   }
2810 
2811   // Swift calling convention does not require we copy the sret argument
2812   // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2813 
2814   // All x86 ABIs require that for returning structs by value we copy
2815   // the sret argument into %rax/%eax (depending on ABI) for the return.
2816   // We saved the argument into a virtual register in the entry block,
2817   // so now we copy the value out and into %rax/%eax.
2818   //
2819   // Checking Function.hasStructRetAttr() here is insufficient because the IR
2820   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2821   // false, then an sret argument may be implicitly inserted in the SelDAG. In
2822   // either case FuncInfo->setSRetReturnReg() will have been called.
2823   if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
2824     // When we have both sret and another return value, we should use the
2825     // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals Chain.
2827 
2828     // For the case of sret and another return value, we have
2829     //   Chain_0 at the function entry
2830     //   Chain_1 = getCopyToReg(Chain_0) in the above loop
2831     // If we use Chain_1 in getCopyFromReg, we will have
2832     //   Val = getCopyFromReg(Chain_1)
2833     //   Chain_2 = getCopyToReg(Chain_1, Val) from below
2834 
2835     // getCopyToReg(Chain_0) will be glued together with
2836     // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2837     // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2838     //   Data dependency from Unit B to Unit A due to usage of Val in
2839     //     getCopyToReg(Chain_1, Val)
2840     //   Chain dependency from Unit A to Unit B
2841 
    // So here, we use RetOps[0] (i.e., Chain_0) for getCopyFromReg.
2843     SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2844                                      getPointerTy(MF.getDataLayout()));
2845 
2846     Register RetValReg
2847         = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2848           X86::RAX : X86::EAX;
2849     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2850     Flag = Chain.getValue(1);
2851 
2852     // RAX/EAX now acts like a return value.
2853     RetOps.push_back(
2854         DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2855 
2856     // Add the returned register to the CalleeSaveDisableRegs list.
2857     if (ShouldDisableCalleeSavedRegister)
2858       MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2859   }
2860 
2861   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2862   const MCPhysReg *I =
2863       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2864   if (I) {
2865     for (; *I; ++I) {
2866       if (X86::GR64RegClass.contains(*I))
2867         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2868       else
2869         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2870     }
2871   }
2872 
2873   RetOps[0] = Chain;  // Update chain.
2874 
2875   // Add the flag if we have it.
2876   if (Flag.getNode())
2877     RetOps.push_back(Flag);
2878 
2879   X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2880   if (CallConv == CallingConv::X86_INTR)
2881     opcode = X86ISD::IRET;
2882   return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2883 }
2884 
2885 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
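  // Only consider nodes that produce a single value with exactly one use.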
2886   if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2887     return false;
2888 
2889   SDValue TCChain = Chain;
2890   SDNode *Copy = *N->use_begin();
2891   if (Copy->getOpcode() == ISD::CopyToReg) {
2892     // If the copy has a glue operand, we conservatively assume it isn't safe to
2893     // perform a tail call.
2894     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2895       return false;
2896     TCChain = Copy->getOperand(0);
2897   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2898     return false;
2899 
2900   bool HasRet = false;
2901   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2902        UI != UE; ++UI) {
2903     if (UI->getOpcode() != X86ISD::RET_FLAG)
2904       return false;
2905     // If we are returning more than one value, we can definitely
    // not make a tail call; see PR19530.
2907     if (UI->getNumOperands() > 4)
2908       return false;
2909     if (UI->getNumOperands() == 4 &&
2910         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2911       return false;
2912     HasRet = true;
2913   }
2914 
2915   if (!HasRet)
2916     return false;
2917 
2918   Chain = TCChain;
2919   return true;
2920 }
2921 
2922 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2923                                            ISD::NodeType ExtendKind) const {
2924   MVT ReturnMVT = MVT::i32;
2925 
2926   bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2927   if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2928     // The ABI does not require i1, i8 or i16 to be extended.
2929     //
2930     // On Darwin, there is code in the wild relying on Clang's old behaviour of
2931     // always extending i8/i16 return values, so keep doing that for now.
2932     // (PR26665).
2933     ReturnMVT = MVT::i8;
2934   }
2935 
2936   EVT MinVT = getRegisterType(Context, ReturnMVT);
2937   return VT.bitsLT(MinVT) ? MinVT : VT;
2938 }
2939 
/// Reads two 32-bit registers and creates a 64-bit mask value.
/// \param VA The current 32-bit value that needs to be assigned.
/// \param NextVA The next 32-bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag Represents the SDValue in the parent DAG node used
///                        for glue purposes. If the DAG is already using a
///                        physical register instead of a virtual one, we
///                        should glue our new SDValue to the InFlag SDValue.
/// \return a new 64-bit wide SDValue.
2949 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2950                                 SDValue &Root, SelectionDAG &DAG,
2951                                 const SDLoc &Dl, const X86Subtarget &Subtarget,
2952                                 SDValue *InFlag = nullptr) {
2953   assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2954   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2955   assert(VA.getValVT() == MVT::v64i1 &&
2956          "Expecting first location of 64 bit width type");
2957   assert(NextVA.getValVT() == VA.getValVT() &&
2958          "The locations should have the same type");
2959   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2960          "The values should reside in two registers");
2961 
2962   SDValue Lo, Hi;
2963   SDValue ArgValueLo, ArgValueHi;
2964 
2965   MachineFunction &MF = DAG.getMachineFunction();
2966   const TargetRegisterClass *RC = &X86::GR32RegClass;
2967 
2968   // Read a 32 bit value from the registers.
2969   if (nullptr == InFlag) {
2970     // When no physical register is present,
2971     // create an intermediate virtual register.
2972     Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
2973     ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2974     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2975     ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2976   } else {
2977     // When a physical register is available read the value from it and glue
2978     // the reads together.
2979     ArgValueLo =
2980       DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2981     *InFlag = ArgValueLo.getValue(2);
2982     ArgValueHi =
2983       DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2984     *InFlag = ArgValueHi.getValue(2);
2985   }
2986 
2987   // Convert the i32 type into v32i1 type.
2988   Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2989 
2990   // Convert the i32 type into v32i1 type.
2991   Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2992 
2993   // Concatenate the two values together.
2994   return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2995 }
2996 
/// Lower a register value of various sizes (8/16/32/64 bits)
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
/// \returns a DAG node containing the operand after lowering to a mask type.
3000 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3001                                const EVT &ValLoc, const SDLoc &Dl,
3002                                SelectionDAG &DAG) {
3003   SDValue ValReturned = ValArg;
3004 
3005   if (ValVT == MVT::v1i1)
3006     return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3007 
3008   if (ValVT == MVT::v64i1) {
    // On 32-bit targets this case is handled by getv64i1Argument.
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
    // On 64-bit targets there is no need to truncate the value, only bitcast.
3012   } else {
3013     MVT maskLen;
3014     switch (ValVT.getSimpleVT().SimpleTy) {
3015     case MVT::v8i1:
3016       maskLen = MVT::i8;
3017       break;
3018     case MVT::v16i1:
3019       maskLen = MVT::i16;
3020       break;
3021     case MVT::v32i1:
3022       maskLen = MVT::i32;
3023       break;
3024     default:
3025       llvm_unreachable("Expecting a vector of i1 types");
3026     }
3027 
3028     ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3029   }
3030   return DAG.getBitcast(ValVT, ValReturned);
3031 }
3032 
3033 /// Lower the result values of a call into the
3034 /// appropriate copies out of appropriate physical registers.
3035 ///
3036 SDValue X86TargetLowering::LowerCallResult(
3037     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3038     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3039     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3040     uint32_t *RegMask) const {
3041 
3042   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3043   // Assign locations to each value returned by this call.
3044   SmallVector<CCValAssign, 16> RVLocs;
3045   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3046                  *DAG.getContext());
3047   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3048 
3049   // Copy all of the result registers out of their specified physreg.
3050   for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3051        ++I, ++InsIndex) {
3052     CCValAssign &VA = RVLocs[I];
3053     EVT CopyVT = VA.getLocVT();
3054 
3055     // In some calling conventions we need to remove the used registers
3056     // from the register mask.
3057     if (RegMask) {
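      // The mask has one bit per register, packed 32 per 32-bit word; clear
      // the bits for this register and all of its sub-registers.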
3058       for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3059            SubRegs.isValid(); ++SubRegs)
3060         RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3061     }
3062 
3063     // Report an error if there was an attempt to return FP values via XMM
3064     // registers.
3065     if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3066       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3067       if (VA.getLocReg() == X86::XMM1)
3068         VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3069       else
3070         VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3071     } else if (!Subtarget.hasSSE2() &&
3072                X86::FR64XRegClass.contains(VA.getLocReg()) &&
3073                CopyVT == MVT::f64) {
3074       errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3075       if (VA.getLocReg() == X86::XMM1)
3076         VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3077       else
3078         VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3079     }
3080 
3081     // If we prefer to use the value in xmm registers, copy it out as f80 and
3082     // use a truncate to move it from fp stack reg to xmm reg.
3083     bool RoundAfterCopy = false;
3084     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3085         isScalarFPTypeInSSEReg(VA.getValVT())) {
3086       if (!Subtarget.hasX87())
3087         report_fatal_error("X87 register return with X87 disabled");
3088       CopyVT = MVT::f80;
3089       RoundAfterCopy = (CopyVT != VA.getLocVT());
3090     }
3091 
3092     SDValue Val;
3093     if (VA.needsCustom()) {
3094       assert(VA.getValVT() == MVT::v64i1 &&
3095              "Currently the only custom case is when we split v64i1 to 2 regs");
3096       Val =
3097           getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3098     } else {
3099       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3100                   .getValue(1);
3101       Val = Chain.getValue(0);
3102       InFlag = Chain.getValue(2);
3103     }
3104 
3105     if (RoundAfterCopy)
3106       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3107                         // This truncation won't change the value.
3108                         DAG.getIntPtrConstant(1, dl));
3109 
3110     if (VA.isExtInLoc()) {
3111       if (VA.getValVT().isVector() &&
3112           VA.getValVT().getScalarType() == MVT::i1 &&
3113           ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3114            (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // The mask type (v*i1) was promoted into a register of type
        // i64/i32/i16/i8; lower it back to a mask.
3116         Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3117       } else
3118         Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3119     }
3120 
3121     if (VA.getLocInfo() == CCValAssign::BCvt)
3122       Val = DAG.getBitcast(VA.getValVT(), Val);
3123 
3124     InVals.push_back(Val);
3125   }
3126 
3127   return Chain;
3128 }
3129 
3130 //===----------------------------------------------------------------------===//
3131 //                C & StdCall & Fast Calling Convention implementation
3132 //===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention only slightly: the
//  callee cleans up the stack instead of the caller, and symbols are
//  decorated in some fancy way :) It doesn't support any vector arguments.
3137 //  For info on fast calling convention see Fast Calling Convention (tail call)
3138 //  implementation LowerX86_32FastCCCallTo.
3139 
/// Determines whether a call uses struct return semantics.
3142 enum StructReturnType {
3143   NotStructReturn,
3144   RegStructReturn,
3145   StackStructReturn
3146 };
3147 static StructReturnType
3148 callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
3149   if (Outs.empty())
3150     return NotStructReturn;
3151 
3152   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
3153   if (!Flags.isSRet())
3154     return NotStructReturn;
3155   if (Flags.isInReg() || IsMCU)
3156     return RegStructReturn;
3157   return StackStructReturn;
3158 }
3159 
3160 /// Determines whether a function uses struct return semantics.
3161 static StructReturnType
3162 argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
3163   if (Ins.empty())
3164     return NotStructReturn;
3165 
3166   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
3167   if (!Flags.isSRet())
3168     return NotStructReturn;
3169   if (Flags.isInReg() || IsMCU)
3170     return RegStructReturn;
3171   return StackStructReturn;
3172 }
3173 
3174 /// Make a copy of an aggregate at address specified by "Src" to address
3175 /// "Dst" with size and alignment information specified by the specific
3176 /// parameter attribute. The copy will be passed as a byval function parameter.
3177 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3178                                          SDValue Chain, ISD::ArgFlagsTy Flags,
3179                                          SelectionDAG &DAG, const SDLoc &dl) {
3180   SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3181 
3182   return DAG.getMemcpy(
3183       Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3184       /*isVolatile*/ false, /*AlwaysInline=*/true,
3185       /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3186 }
3187 
3188 /// Return true if the calling convention is one that we can guarantee TCO for.
3189 static bool canGuaranteeTCO(CallingConv::ID CC) {
3190   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3191           CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3192           CC == CallingConv::HHVM || CC == CallingConv::Tail ||
3193           CC == CallingConv::SwiftTail);
3194 }
3195 
3196 /// Return true if we might ever do TCO for calls with this calling convention.
3197 static bool mayTailCallThisCC(CallingConv::ID CC) {
3198   switch (CC) {
3199   // C calling conventions:
3200   case CallingConv::C:
3201   case CallingConv::Win64:
3202   case CallingConv::X86_64_SysV:
3203   // Callee pop conventions:
3204   case CallingConv::X86_ThisCall:
3205   case CallingConv::X86_StdCall:
3206   case CallingConv::X86_VectorCall:
3207   case CallingConv::X86_FastCall:
3208   // Swift:
3209   case CallingConv::Swift:
3210     return true;
3211   default:
3212     return canGuaranteeTCO(CC);
3213   }
3214 }
3215 
3216 /// Return true if the function is being made into a tailcall target by
3217 /// changing its ABI.
3218 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3219   return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3220          CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3221 }
3222 
3223 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3224   if (!CI->isTailCall())
3225     return false;
3226 
3227   CallingConv::ID CalleeCC = CI->getCallingConv();
3228   if (!mayTailCallThisCC(CalleeCC))
3229     return false;
3230 
3231   return true;
3232 }
3233 
3234 SDValue
3235 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3236                                     const SmallVectorImpl<ISD::InputArg> &Ins,
3237                                     const SDLoc &dl, SelectionDAG &DAG,
3238                                     const CCValAssign &VA,
3239                                     MachineFrameInfo &MFI, unsigned i) const {
3240   // Create the nodes corresponding to a load from this parameter slot.
3241   ISD::ArgFlagsTy Flags = Ins[i].Flags;
3242   bool AlwaysUseMutable = shouldGuaranteeTCO(
3243       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3244   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3245   EVT ValVT;
3246   MVT PtrVT = getPointerTy(DAG.getDataLayout());
3247 
  // If the value is passed by pointer, we have the address passed instead of
  // the value itself. No need to extend if the mask value and location share
  // the same absolute size.
3251   bool ExtendedInMem =
3252       VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3253       VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3254 
3255   if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3256     ValVT = VA.getLocVT();
3257   else
3258     ValVT = VA.getValVT();
3259 
3260   // FIXME: For now, all byval parameter objects are marked mutable. This can be
3261   // changed with more analysis.
  // In case of tail call optimization mark all arguments mutable, since they
  // could be overwritten by the lowering of arguments in case of a tail call.
3264   if (Flags.isByVal()) {
3265     unsigned Bytes = Flags.getByValSize();
3266     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3267 
3268     // FIXME: For now, all byval parameter objects are marked as aliasing. This
3269     // can be improved with deeper analysis.
3270     int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3271                                    /*isAliased=*/true);
3272     return DAG.getFrameIndex(FI, PtrVT);
3273   }
3274 
3275   EVT ArgVT = Ins[i].ArgVT;
3276 
3277   // If this is a vector that has been split into multiple parts, and the
  // scalar size of the parts doesn't match the vector element size, we can't
3279   // elide the copy. The parts will have padding between them instead of being
3280   // packed like a vector.
3281   bool ScalarizedAndExtendedVector =
3282       ArgVT.isVector() && !VA.getLocVT().isVector() &&
3283       VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3284 
3285   // This is an argument in memory. We might be able to perform copy elision.
3286   // If the argument is passed directly in memory without any extension, then we
3287   // can perform copy elision. Large vector types, for example, may be passed
3288   // indirectly by pointer.
3289   if (Flags.isCopyElisionCandidate() &&
3290       VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3291       !ScalarizedAndExtendedVector) {
3292     SDValue PartAddr;
3293     if (Ins[i].PartOffset == 0) {
3294       // If this is a one-part value or the first part of a multi-part value,
3295       // create a stack object for the entire argument value type and return a
3296       // load from our portion of it. This assumes that if the first part of an
3297       // argument is in memory, the rest will also be in memory.
3298       int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3299                                      /*IsImmutable=*/false);
3300       PartAddr = DAG.getFrameIndex(FI, PtrVT);
3301       return DAG.getLoad(
3302           ValVT, dl, Chain, PartAddr,
3303           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3304     } else {
3305       // This is not the first piece of an argument in memory. See if there is
3306       // already a fixed stack object including this offset. If so, assume it
3307       // was created by the PartOffset == 0 branch above and create a load from
3308       // the appropriate offset into it.
3309       int64_t PartBegin = VA.getLocMemOffset();
3310       int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3311       int FI = MFI.getObjectIndexBegin();
3312       for (; MFI.isFixedObjectIndex(FI); ++FI) {
3313         int64_t ObjBegin = MFI.getObjectOffset(FI);
3314         int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3315         if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3316           break;
3317       }
3318       if (MFI.isFixedObjectIndex(FI)) {
3319         SDValue Addr =
3320             DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3321                         DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3322         return DAG.getLoad(
3323             ValVT, dl, Chain, Addr,
3324             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3325                                               Ins[i].PartOffset));
3326       }
3327     }
3328   }
3329 
3330   int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3331                                  VA.getLocMemOffset(), isImmutable);
3332 
3333   // Set SExt or ZExt flag.
3334   if (VA.getLocInfo() == CCValAssign::ZExt) {
3335     MFI.setObjectZExt(FI, true);
3336   } else if (VA.getLocInfo() == CCValAssign::SExt) {
3337     MFI.setObjectSExt(FI, true);
3338   }
3339 
3340   SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3341   SDValue Val = DAG.getLoad(
3342       ValVT, dl, Chain, FIN,
3343       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
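  // If the value was widened in memory, convert it back to the declared
  // value type: scalar-to-vector for mask values, truncate otherwise.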
3344   return ExtendedInMem
3345              ? (VA.getValVT().isVector()
3346                     ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3347                     : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3348              : Val;
3349 }
3350 
3351 // FIXME: Get this from tablegen.
3352 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3353                                                 const X86Subtarget &Subtarget) {
3354   assert(Subtarget.is64Bit());
3355 
3356   if (Subtarget.isCallingConvWin64(CallConv)) {
3357     static const MCPhysReg GPR64ArgRegsWin64[] = {
3358       X86::RCX, X86::RDX, X86::R8,  X86::R9
3359     };
3360     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3361   }
3362 
3363   static const MCPhysReg GPR64ArgRegs64Bit[] = {
3364     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3365   };
3366   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3367 }
3368 
3369 // FIXME: Get this from tablegen.
3370 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3371                                                 CallingConv::ID CallConv,
3372                                                 const X86Subtarget &Subtarget) {
3373   assert(Subtarget.is64Bit());
3374   if (Subtarget.isCallingConvWin64(CallConv)) {
3375     // The XMM registers which might contain var arg parameters are shadowed
3376     // in their paired GPR.  So we only need to save the GPR to their home
3377     // slots.
3378     // TODO: __vectorcall will change this.
3379     return None;
3380   }
3381 
3382   bool isSoftFloat = Subtarget.useSoftFloat();
3383   if (isSoftFloat || !Subtarget.hasSSE1())
3384     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3385     // registers.
3386     return None;
3387 
3388   static const MCPhysReg XMMArgRegs64Bit[] = {
3389     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3390     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3391   };
3392   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3393 }
3394 
3395 #ifndef NDEBUG
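// Assertion helper: check that the argument locations are sorted by their
// original IR value number.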
3396 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3397   return llvm::is_sorted(
3398       ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3399         return A.getValNo() < B.getValNo();
3400       });
3401 }
3402 #endif
3403 
3404 namespace {
/// This is a helper class for lowering variable argument (vararg) parameters.
3406 class VarArgsLoweringHelper {
3407 public:
3408   VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3409                         SelectionDAG &DAG, const X86Subtarget &Subtarget,
3410                         CallingConv::ID CallConv, CCState &CCInfo)
3411       : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3412         TheMachineFunction(DAG.getMachineFunction()),
3413         TheFunction(TheMachineFunction.getFunction()),
3414         FrameInfo(TheMachineFunction.getFrameInfo()),
3415         FrameLowering(*Subtarget.getFrameLowering()),
3416         TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3417         CCInfo(CCInfo) {}
3418 
  // Lower the variable argument (vararg) parameters.
3420   void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3421 
3422 private:
3423   void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3424 
3425   void forwardMustTailParameters(SDValue &Chain);
3426 
3427   bool is64Bit() const { return Subtarget.is64Bit(); }
3428   bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3429 
3430   X86MachineFunctionInfo *FuncInfo;
3431   const SDLoc &DL;
3432   SelectionDAG &DAG;
3433   const X86Subtarget &Subtarget;
3434   MachineFunction &TheMachineFunction;
3435   const Function &TheFunction;
3436   MachineFrameInfo &FrameInfo;
3437   const TargetFrameLowering &FrameLowering;
3438   const TargetLowering &TargLowering;
3439   CallingConv::ID CallConv;
3440   CCState &CCInfo;
3441 };
3442 } // namespace
3443 
3444 void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3445     SDValue &Chain, unsigned StackSize) {
  // If the function takes a variable number of arguments, make a frame index
  // for the start of the first vararg value... for expansion of
  // llvm.va_start. We can skip this if there are no va_start calls.
3449   if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3450                     CallConv != CallingConv::X86_ThisCall)) {
3451     FuncInfo->setVarArgsFrameIndex(
3452         FrameInfo.CreateFixedObject(1, StackSize, true));
3453   }
3454 
3455   // 64-bit calling conventions support varargs and register parameters, so we
3456   // have to do extra work to spill them in the prologue.
3457   if (is64Bit()) {
    // Find the first unallocated GPR and XMM argument registers.
3459     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3460     ArrayRef<MCPhysReg> ArgXMMs =
3461         get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3462     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3463     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3464 
3465     assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3466            "SSE register cannot be used when SSE is disabled!");
3467 
3468     if (isWin64()) {
3469       // Get to the caller-allocated home save location.  Add 8 to account
3470       // for the return address.
3471       int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
3472       FuncInfo->setRegSaveFrameIndex(
3473           FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3474       // Fixup to set vararg frame on shadow area (4 x i64).
3475       if (NumIntRegs < 4)
3476         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3477     } else {
3478       // For X86-64, if there are vararg parameters that are passed via
3479       // registers, then we must store them to their spots on the stack so
3480       // they may be loaded by dereferencing the result of va_next.
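      // The register save area is laid out as all GPR slots (8 bytes each)
      // followed by all XMM slots (16 bytes each); the recorded offsets point
      // at the first slot not already used by named arguments.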
3481       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3482       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3483       FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
3484           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
3485     }
3486 
    // SDValues for the GPR argument registers holding live-in values.
    SmallVector<SDValue, 6> LiveGPRs;
    // SDValues for the XMM argument registers holding live-in values.
    SmallVector<SDValue, 8> LiveXMMRegs;
    SDValue ALVal; // If applicable, holds the SDValue for the %al register.
3492 
3493     // Gather all the live in physical registers.
3494     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3495       Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
3496       LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
3497     }
3498     const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
3499     if (!AvailableXmms.empty()) {
3500       Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3501       ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
3502       for (MCPhysReg Reg : AvailableXmms) {
        // FastRegisterAllocator spills virtual registers at basic
        // block boundaries. That leads to uses of XMM registers
        // outside of the check for %al. Pass physical registers to
        // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
3507         TheMachineFunction.getRegInfo().addLiveIn(Reg);
3508         LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
3509       }
3510     }
3511 
3512     // Store the integer parameter registers.
3513     SmallVector<SDValue, 8> MemOps;
3514     SDValue RSFIN =
3515         DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3516                           TargLowering.getPointerTy(DAG.getDataLayout()));
3517     unsigned Offset = FuncInfo->getVarArgsGPOffset();
3518     for (SDValue Val : LiveGPRs) {
3519       SDValue FIN = DAG.getNode(ISD::ADD, DL,
3520                                 TargLowering.getPointerTy(DAG.getDataLayout()),
3521                                 RSFIN, DAG.getIntPtrConstant(Offset, DL));
3522       SDValue Store =
3523           DAG.getStore(Val.getValue(1), DL, Val, FIN,
3524                        MachinePointerInfo::getFixedStack(
3525                            DAG.getMachineFunction(),
3526                            FuncInfo->getRegSaveFrameIndex(), Offset));
3527       MemOps.push_back(Store);
3528       Offset += 8;
3529     }
3530 
3531     // Now store the XMM (fp + vector) parameter registers.
3532     if (!LiveXMMRegs.empty()) {
3533       SmallVector<SDValue, 12> SaveXMMOps;
3534       SaveXMMOps.push_back(Chain);
3535       SaveXMMOps.push_back(ALVal);
3536       SaveXMMOps.push_back(
3537           DAG.getTargetConstant(FuncInfo->getRegSaveFrameIndex(), DL, MVT::i32));
3538       SaveXMMOps.push_back(
3539           DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
3540       llvm::append_range(SaveXMMOps, LiveXMMRegs);
3541       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
3542                                    MVT::Other, SaveXMMOps));
3543     }
3544 
3545     if (!MemOps.empty())
3546       Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3547   }
3548 }
3549 
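// An assumed (illustrative, not taken from a test) IR example of what needs
// this forwarding: a musttail thunk inside a varargs function that never
// calls va_start but must pass its variadic arguments through unchanged:
//
//   define void @thunk(i8* %this, ...) {
//     musttail call void (i8*, ...) @impl(i8* %this, ...)
//     ret void
//   }
//
// The vararg registers are copied into virtual registers here and copied back
// into the same physical registers just before the musttail call (LowerCall).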
3550 void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
3551   // Find the largest legal vector type.
3552   MVT VecVT = MVT::Other;
3553   // FIXME: Only some x86_32 calling conventions support AVX512.
3554   if (Subtarget.useAVX512Regs() &&
3555       (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
3556                      CallConv == CallingConv::Intel_OCL_BI)))
3557     VecVT = MVT::v16f32;
3558   else if (Subtarget.hasAVX())
3559     VecVT = MVT::v8f32;
3560   else if (Subtarget.hasSSE2())
3561     VecVT = MVT::v4f32;
3562 
3563   // We forward some GPRs and some vector types.
3564   SmallVector<MVT, 2> RegParmTypes;
3565   MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
3566   RegParmTypes.push_back(IntVT);
3567   if (VecVT != MVT::Other)
3568     RegParmTypes.push_back(VecVT);
3569 
3570   // Compute the set of forwarded registers. The rest are scratch.
3571   SmallVectorImpl<ForwardedRegister> &Forwards =
3572       FuncInfo->getForwardedMustTailRegParms();
3573   CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3574 
3575   // Forward AL for SysV x86_64 targets, since it is used for varargs.
3576   if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
3577     Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3578     Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3579   }
3580 
3581   // Copy all forwards from physical to virtual registers.
3582   for (ForwardedRegister &FR : Forwards) {
3583     // FIXME: Can we use a less constrained schedule?
3584     SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
3585     FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
3586         TargLowering.getRegClassFor(FR.VT));
3587     Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
3588   }
3589 }
3590 
3591 void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
3592                                                    unsigned StackSize) {
  // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
  // If necessary, it will be overwritten with the correct value later.
3595   FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3596   FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3597 
3598   if (FrameInfo.hasVAStart())
3599     createVarArgAreaAndStoreRegisters(Chain, StackSize);
3600 
3601   if (FrameInfo.hasMustTailInVarArgFunc())
3602     forwardMustTailParameters(Chain);
3603 }
3604 
3605 SDValue X86TargetLowering::LowerFormalArguments(
3606     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
3607     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3608     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3609   MachineFunction &MF = DAG.getMachineFunction();
3610   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3611 
3612   const Function &F = MF.getFunction();
3613   if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3614       F.getName() == "main")
3615     FuncInfo->setForceFramePointer(true);
3616 
3617   MachineFrameInfo &MFI = MF.getFrameInfo();
3618   bool Is64Bit = Subtarget.is64Bit();
3619   bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3620 
  assert(!(IsVarArg && canGuaranteeTCO(CallConv)) &&
         "Var args not supported with calling conventions regcall, fastcc, "
         "ghc or hipe");
3624 
3625   // Assign locations to all of the incoming arguments.
3626   SmallVector<CCValAssign, 16> ArgLocs;
3627   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3628 
3629   // Allocate shadow area for Win64.
3630   if (IsWin64)
3631     CCInfo.AllocateStack(32, Align(8));
3632 
3633   CCInfo.AnalyzeArguments(Ins, CC_X86);
3634 
  // In the vectorcall calling convention, a second pass is required for the
  // HVA types.
3637   if (CallingConv::X86_VectorCall == CallConv) {
3638     CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3639   }
3640 
  // The next loop assumes that the locations are in the same order as the
  // input arguments.
3643   assert(isSortedByValueNo(ArgLocs) &&
3644          "Argument Location list must be sorted before lowering");
3645 
3646   SDValue ArgValue;
3647   for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3648        ++I, ++InsIndex) {
3649     assert(InsIndex < Ins.size() && "Invalid Ins index");
3650     CCValAssign &VA = ArgLocs[I];
3651 
3652     if (VA.isRegLoc()) {
3653       EVT RegVT = VA.getLocVT();
3654       if (VA.needsCustom()) {
3655         assert(
3656             VA.getValVT() == MVT::v64i1 &&
3657             "Currently the only custom case is when we split v64i1 to 2 regs");
3658 
        // In the regcall calling convention on 32-bit targets, v64i1 values
        // are split up into two registers.
3661         ArgValue =
3662             getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3663       } else {
3664         const TargetRegisterClass *RC;
3665         if (RegVT == MVT::i8)
3666           RC = &X86::GR8RegClass;
3667         else if (RegVT == MVT::i16)
3668           RC = &X86::GR16RegClass;
3669         else if (RegVT == MVT::i32)
3670           RC = &X86::GR32RegClass;
3671         else if (Is64Bit && RegVT == MVT::i64)
3672           RC = &X86::GR64RegClass;
3673         else if (RegVT == MVT::f32)
3674           RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3675         else if (RegVT == MVT::f64)
3676           RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3677         else if (RegVT == MVT::f80)
3678           RC = &X86::RFP80RegClass;
3679         else if (RegVT == MVT::f128)
3680           RC = &X86::VR128RegClass;
3681         else if (RegVT.is512BitVector())
3682           RC = &X86::VR512RegClass;
3683         else if (RegVT.is256BitVector())
3684           RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3685         else if (RegVT.is128BitVector())
3686           RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3687         else if (RegVT == MVT::x86mmx)
3688           RC = &X86::VR64RegClass;
3689         else if (RegVT == MVT::v1i1)
3690           RC = &X86::VK1RegClass;
3691         else if (RegVT == MVT::v8i1)
3692           RC = &X86::VK8RegClass;
3693         else if (RegVT == MVT::v16i1)
3694           RC = &X86::VK16RegClass;
3695         else if (RegVT == MVT::v32i1)
3696           RC = &X86::VK32RegClass;
3697         else if (RegVT == MVT::v64i1)
3698           RC = &X86::VK64RegClass;
3699         else
3700           llvm_unreachable("Unknown argument type!");
3701 
3702         Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3703         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3704       }
3705 
3706       // If this is an 8 or 16-bit value, it is really passed promoted to 32
3707       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
3708       // right size.
3709       if (VA.getLocInfo() == CCValAssign::SExt)
3710         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3711                                DAG.getValueType(VA.getValVT()));
3712       else if (VA.getLocInfo() == CCValAssign::ZExt)
3713         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3714                                DAG.getValueType(VA.getValVT()));
3715       else if (VA.getLocInfo() == CCValAssign::BCvt)
3716         ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3717 
3718       if (VA.isExtInLoc()) {
3719         // Handle MMX values passed in XMM regs.
3720         if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3721           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3722         else if (VA.getValVT().isVector() &&
3723                  VA.getValVT().getScalarType() == MVT::i1 &&
3724                  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3725                   (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3726           // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3727           ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3728         } else
3729           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3730       }
3731     } else {
3732       assert(VA.isMemLoc());
3733       ArgValue =
3734           LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3735     }
3736 
    // If the value is passed via a pointer, do a load.
3738     if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3739       ArgValue =
3740           DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3741 
3742     InVals.push_back(ArgValue);
3743   }
3744 
3745   for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3746     if (Ins[I].Flags.isSwiftAsync()) {
3747       auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
3748       if (Subtarget.is64Bit())
3749         X86FI->setHasSwiftAsyncContext(true);
3750       else {
3751         int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
3752         X86FI->setSwiftAsyncContextFrameIdx(FI);
3753         SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
3754                                   DAG.getFrameIndex(FI, MVT::i32),
3755                                   MachinePointerInfo::getFixedStack(MF, FI));
3756         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
3757       }
3758     }
3759 
    // The Swift calling convention does not require that we copy the sret
    // argument into %rax/%eax for the return. We don't set SRetReturnReg for
    // Swift.
3762     if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
3763       continue;
3764 
3765     // All x86 ABIs require that for returning structs by value we copy the
3766     // sret argument into %rax/%eax (depending on ABI) for the return. Save
3767     // the argument into a virtual register so that we can access it from the
3768     // return points.
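    // For example (a sketch): given "struct S { int v[4]; }; S f();", the
    // caller passes a hidden pointer to the result slot as the first argument,
    // and f() must return that same pointer in %eax (32-bit) / %rax (64-bit);
    // the copy into SRetReturnReg below is what makes it available at returns.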
3769     if (Ins[I].Flags.isSRet()) {
3770       Register Reg = FuncInfo->getSRetReturnReg();
3771       if (!Reg) {
3772         MVT PtrTy = getPointerTy(DAG.getDataLayout());
3773         Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3774         FuncInfo->setSRetReturnReg(Reg);
3775       }
3776       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3777       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3778       break;
3779     }
3780   }
3781 
3782   unsigned StackSize = CCInfo.getNextStackOffset();
3783   // Align stack specially for tail calls.
3784   if (shouldGuaranteeTCO(CallConv,
3785                          MF.getTarget().Options.GuaranteedTailCallOpt))
3786     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3787 
3788   if (IsVarArg)
3789     VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
3790         .lowerVarArgsParameters(Chain, StackSize);
3791 
3792   // Some CCs need callee pop.
3793   if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
3794                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
3795     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3796   } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3797     // X86 interrupts must pop the error code (and the alignment padding) if
3798     // present.
3799     FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3800   } else {
3801     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3802     // If this is an sret function, the return should pop the hidden pointer.
3803     if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3804         !Subtarget.getTargetTriple().isOSMSVCRT() &&
3805         argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3806       FuncInfo->setBytesToPopOnReturn(4);
3807   }
3808 
3809   if (!Is64Bit) {
3810     // RegSaveFrameIndex is X86-64 only.
3811     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3812   }
3813 
3814   FuncInfo->setArgumentStackSize(StackSize);
3815 
3816   if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3817     EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3818     if (Personality == EHPersonality::CoreCLR) {
3819       assert(Is64Bit);
3820       // TODO: Add a mechanism to frame lowering that will allow us to indicate
3821       // that we'd prefer this slot be allocated towards the bottom of the frame
3822       // (i.e. near the stack pointer after allocating the frame).  Every
3823       // funclet needs a copy of this slot in its (mostly empty) frame, and the
3824       // offset from the bottom of this and each funclet's frame must be the
3825       // same, so the size of funclets' (mostly empty) frames is dictated by
3826       // how far this slot is from the bottom (since they allocate just enough
3827       // space to accommodate holding this slot at the correct offset).
3828       int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
3829       EHInfo->PSPSymFrameIdx = PSPSymFI;
3830     }
3831   }
3832 
3833   if (CallConv == CallingConv::X86_RegCall ||
3834       F.hasFnAttribute("no_caller_saved_registers")) {
3835     MachineRegisterInfo &MRI = MF.getRegInfo();
3836     for (std::pair<Register, Register> Pair : MRI.liveins())
3837       MRI.disableCalleeSavedRegister(Pair.first);
3838   }
3839 
3840   return Chain;
3841 }
3842 
3843 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3844                                             SDValue Arg, const SDLoc &dl,
3845                                             SelectionDAG &DAG,
3846                                             const CCValAssign &VA,
3847                                             ISD::ArgFlagsTy Flags,
3848                                             bool isByVal) const {
3849   unsigned LocMemOffset = VA.getLocMemOffset();
3850   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3851   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3852                        StackPtr, PtrOff);
3853   if (isByVal)
3854     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3855 
3856   return DAG.getStore(
3857       Chain, dl, Arg, PtrOff,
3858       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3859 }
3860 
/// Emit a load of the return address if tail call
/// optimization is performed and it is required.
3863 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3864     SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3865     bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3866   // Adjust the Return address stack slot.
3867   EVT VT = getPointerTy(DAG.getDataLayout());
3868   OutRetAddr = getReturnAddressFrameIndex(DAG);
3869 
3870   // Load the "old" Return address.
3871   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3872   return SDValue(OutRetAddr.getNode(), 1);
3873 }
3874 
3875 /// Emit a store of the return address if tail call
3876 /// optimization is performed and it is required (FPDiff!=0).
3877 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3878                                         SDValue Chain, SDValue RetAddrFrIdx,
3879                                         EVT PtrVT, unsigned SlotSize,
3880                                         int FPDiff, const SDLoc &dl) {
3881   // Store the return address to the appropriate stack slot.
3882   if (!FPDiff) return Chain;
3883   // Calculate the new stack slot for the return address.
3884   int NewReturnAddrFI =
3885     MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3886                                          false);
3887   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3888   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3889                        MachinePointerInfo::getFixedStack(
3890                            DAG.getMachineFunction(), NewReturnAddrFI));
3891   return Chain;
3892 }
3893 
/// Returns a vector_shuffle mask for a movs{s|d} / movd
/// operation of the specified width.
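/// For example, with VT = v4f32 (NumElems = 4) this builds the mask
/// <4, 1, 2, 3>, i.e. take element 0 from V2 and elements 1-3 from V1, which
/// matches the MOVSS semantics (and analogously MOVSD for v2f64).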
3896 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3897                        SDValue V2) {
3898   unsigned NumElems = VT.getVectorNumElements();
3899   SmallVector<int, 8> Mask;
3900   Mask.push_back(NumElems);
3901   for (unsigned i = 1; i != NumElems; ++i)
3902     Mask.push_back(i);
3903   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3904 }
3905 
3906 SDValue
3907 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3908                              SmallVectorImpl<SDValue> &InVals) const {
3909   SelectionDAG &DAG                     = CLI.DAG;
3910   SDLoc &dl                             = CLI.DL;
3911   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3912   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
3913   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
3914   SDValue Chain                         = CLI.Chain;
3915   SDValue Callee                        = CLI.Callee;
3916   CallingConv::ID CallConv              = CLI.CallConv;
3917   bool &isTailCall                      = CLI.IsTailCall;
3918   bool isVarArg                         = CLI.IsVarArg;
3919   const auto *CB                        = CLI.CB;
3920 
3921   MachineFunction &MF = DAG.getMachineFunction();
3922   bool Is64Bit        = Subtarget.is64Bit();
3923   bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
3924   StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3925   bool IsSibcall      = false;
3926   bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
3927       CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
3928   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3929   bool HasNCSR = (CB && isa<CallInst>(CB) &&
3930                   CB->hasFnAttr("no_caller_saved_registers"));
3931   bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
3932   bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
3933   const Module *M = MF.getMMI().getModule();
3934   Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3935 
3936   MachineFunction::CallSiteInfo CSInfo;
3937   if (CallConv == CallingConv::X86_INTR)
3938     report_fatal_error("X86 interrupts may not be called directly");
3939 
3940   bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
3941   if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
    // If we are using a GOT, disable tail calls to external symbols with
    // default visibility. Tail calling such a symbol requires using a GOT
    // relocation, which forces early binding of the symbol. This breaks code
    // that requires lazy function symbol resolution. Using musttail or
    // GuaranteedTailCallOpt will override this.
3947     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3948     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3949                G->getGlobal()->hasDefaultVisibility()))
3950       isTailCall = false;
3951   }
3952 
3954   if (isTailCall && !IsMustTail) {
3955     // Check if it's really possible to do a tail call.
3956     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3957                     isVarArg, SR != NotStructReturn,
3958                     MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3959                     Outs, OutVals, Ins, DAG);
3960 
3961     // Sibcalls are automatically detected tailcalls which do not require
3962     // ABI changes.
3963     if (!IsGuaranteeTCO && isTailCall)
3964       IsSibcall = true;
3965 
3966     if (isTailCall)
3967       ++NumTailCalls;
3968   }
3969 
3970   if (IsMustTail && !isTailCall)
3971     report_fatal_error("failed to perform tail call elimination on a call "
3972                        "site marked musttail");
3973 
  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
         "Var args not supported with calling conventions fastcc, ghc or hipe");
3976 
3977   // Analyze operands of the call, assigning locations to each operand.
3978   SmallVector<CCValAssign, 16> ArgLocs;
3979   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3980 
3981   // Allocate shadow area for Win64.
3982   if (IsWin64)
3983     CCInfo.AllocateStack(32, Align(8));
3984 
3985   CCInfo.AnalyzeArguments(Outs, CC_X86);
3986 
  // In the vectorcall calling convention, a second pass is required for the
  // HVA types.
3989   if (CallingConv::X86_VectorCall == CallConv) {
3990     CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3991   }
3992 
3993   // Get a count of how many bytes are to be pushed on the stack.
3994   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3995   if (IsSibcall)
    // This is a sibcall. The memory operands are already in place in our own
    // caller's stack (the incoming argument area), so no space is needed.
3998     NumBytes = 0;
3999   else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4000     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4001 
4002   int FPDiff = 0;
4003   if (isTailCall &&
4004       shouldGuaranteeTCO(CallConv,
4005                          MF.getTarget().Options.GuaranteedTailCallOpt)) {
4006     // Lower arguments at fp - stackoffset + fpdiff.
4007     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4008 
4009     FPDiff = NumBytesCallerPushed - NumBytes;
4010 
    // Record the delta by which the return address stack slot moves, but only
    // if the new delta moves the slot further down than the previously
    // recorded delta (i.e. FPDiff is more negative).
4013     if (FPDiff < X86Info->getTCReturnAddrDelta())
4014       X86Info->setTCReturnAddrDelta(FPDiff);
4015   }
4016 
4017   unsigned NumBytesToPush = NumBytes;
4018   unsigned NumBytesToPop = NumBytes;
4019 
  // If we have an inalloca argument, all stack space has already been
  // allocated for us and is right at the top of the stack. We don't support
  // multiple arguments passed in memory when using inalloca.
4023   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4024     NumBytesToPush = 0;
4025     if (!ArgLocs.back().isMemLoc())
4026       report_fatal_error("cannot use inalloca attribute on a register "
4027                          "parameter");
4028     if (ArgLocs.back().getLocMemOffset() != 0)
4029       report_fatal_error("any parameter with the inalloca attribute must be "
4030                          "the only memory argument");
4031   } else if (CLI.IsPreallocated) {
4032     assert(ArgLocs.back().isMemLoc() &&
4033            "cannot use preallocated attribute on a register "
4034            "parameter");
4035     SmallVector<size_t, 4> PreallocatedOffsets;
4036     for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4037       if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4038         PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4039       }
4040     }
4041     auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4042     size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4043     MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4044     MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4045     NumBytesToPush = 0;
4046   }
4047 
4048   if (!IsSibcall && !IsMustTail)
4049     Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4050                                  NumBytes - NumBytesToPush, dl);
4051 
4052   SDValue RetAddrFrIdx;
4053   // Load return address for tail calls.
4054   if (isTailCall && FPDiff)
4055     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4056                                     Is64Bit, FPDiff, dl);
4057 
4058   SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4059   SmallVector<SDValue, 8> MemOpChains;
4060   SDValue StackPtr;
4061 
  // The next loop assumes that the locations are in the same order as the
  // input arguments.
4064   assert(isSortedByValueNo(ArgLocs) &&
4065          "Argument Location list must be sorted before lowering");
4066 
  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
4069   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4070   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4071        ++I, ++OutIndex) {
4072     assert(OutIndex < Outs.size() && "Invalid Out index");
4073     // Skip inalloca/preallocated arguments, they have already been written.
4074     ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4075     if (Flags.isInAlloca() || Flags.isPreallocated())
4076       continue;
4077 
4078     CCValAssign &VA = ArgLocs[I];
4079     EVT RegVT = VA.getLocVT();
4080     SDValue Arg = OutVals[OutIndex];
4081     bool isByVal = Flags.isByVal();
4082 
4083     // Promote the value if needed.
4084     switch (VA.getLocInfo()) {
4085     default: llvm_unreachable("Unknown loc info!");
4086     case CCValAssign::Full: break;
4087     case CCValAssign::SExt:
4088       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4089       break;
4090     case CCValAssign::ZExt:
4091       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4092       break;
4093     case CCValAssign::AExt:
4094       if (Arg.getValueType().isVector() &&
4095           Arg.getValueType().getVectorElementType() == MVT::i1)
4096         Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4097       else if (RegVT.is128BitVector()) {
4098         // Special case: passing MMX values in XMM registers.
4099         Arg = DAG.getBitcast(MVT::i64, Arg);
4100         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4101         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4102       } else
4103         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4104       break;
4105     case CCValAssign::BCvt:
4106       Arg = DAG.getBitcast(RegVT, Arg);
4107       break;
4108     case CCValAssign::Indirect: {
4109       if (isByVal) {
4110         // Memcpy the argument to a temporary stack slot to prevent
4111         // the caller from seeing any modifications the callee may make
4112         // as guaranteed by the `byval` attribute.
4113         int FrameIdx = MF.getFrameInfo().CreateStackObject(
4114             Flags.getByValSize(),
4115             std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4116         SDValue StackSlot =
4117             DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4118         Chain =
4119             CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4120         // From now on treat this as a regular pointer
4121         Arg = StackSlot;
4122         isByVal = false;
4123       } else {
4124         // Store the argument.
4125         SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4126         int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4127         Chain = DAG.getStore(
4128             Chain, dl, Arg, SpillSlot,
4129             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4130         Arg = SpillSlot;
4131       }
4132       break;
4133     }
4134     }
4135 
4136     if (VA.needsCustom()) {
4137       assert(VA.getValVT() == MVT::v64i1 &&
4138              "Currently the only custom case is when we split v64i1 to 2 regs");
4139       // Split v64i1 value into two registers
4140       Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4141     } else if (VA.isRegLoc()) {
4142       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4143       const TargetOptions &Options = DAG.getTarget().Options;
4144       if (Options.EmitCallSiteInfo)
4145         CSInfo.emplace_back(VA.getLocReg(), I);
4146       if (isVarArg && IsWin64) {
        // The Win64 ABI requires an argument passed in an XMM register to
        // also be copied to the corresponding integer shadow register if the
        // callee is a varargs function.
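        // For example (a sketch at the ABI level): when a double is passed as
        // the second parameter to a Win64 varargs callee, the ABI expects the
        // value both in XMM1 and in RDX; the copy below provides the
        // integer-register half when the value was assigned to the XMM side.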
4149         Register ShadowReg;
4150         switch (VA.getLocReg()) {
4151         case X86::XMM0: ShadowReg = X86::RCX; break;
4152         case X86::XMM1: ShadowReg = X86::RDX; break;
4153         case X86::XMM2: ShadowReg = X86::R8; break;
4154         case X86::XMM3: ShadowReg = X86::R9; break;
4155         }
4156         if (ShadowReg)
4157           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4158       }
4159     } else if (!IsSibcall && (!isTailCall || isByVal)) {
4160       assert(VA.isMemLoc());
4161       if (!StackPtr.getNode())
4162         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4163                                       getPointerTy(DAG.getDataLayout()));
4164       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4165                                              dl, DAG, VA, Flags, isByVal));
4166     }
4167   }
4168 
4169   if (!MemOpChains.empty())
4170     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4171 
4172   if (Subtarget.isPICStyleGOT()) {
    // ELF / PIC requires the GOT pointer to be in the EBX register before
    // function calls via the PLT (except for regcall).
    if (!isTailCall) {
      // An indirect call with the RegCall calling convention may use up all
      // the general-purpose registers, so it is not suitable to reserve EBX
      // for the GOT address; just let the register allocator handle it.
4179       if (CallConv != CallingConv::X86_RegCall)
4180         RegsToPass.push_back(std::make_pair(
4181           Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4182                                           getPointerTy(DAG.getDataLayout()))));
4183     } else {
4184       // If we are tail calling and generating PIC/GOT style code load the
4185       // address of the callee into ECX. The value in ecx is used as target of
4186       // the tail jump. This is done to circumvent the ebx/callee-saved problem
4187       // for tail calls on PIC/GOT architectures. Normally we would just put the
4188       // address of GOT into ebx and then call target@PLT. But for tail calls
4189       // ebx would be restored (since ebx is callee saved) before jumping to the
4190       // target@PLT.
4191 
4192       // Note: The actual moving to ECX is done further down.
4193       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4194       if (G && !G->getGlobal()->hasLocalLinkage() &&
4195           G->getGlobal()->hasDefaultVisibility())
4196         Callee = LowerGlobalAddress(Callee, DAG);
4197       else if (isa<ExternalSymbolSDNode>(Callee))
4198         Callee = LowerExternalSymbol(Callee, DAG);
4199     }
4200   }
4201 
4202   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
    // From the AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration), %al is used as a hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match the
    // exact number of registers, but must be an upper bound on the number of
    // SSE registers used, in the range 0 - 8 inclusive.
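    // For example (a sketch): for printf("%f %f", a, b) the two doubles are
    // assigned to XMM0 and XMM1, so NumXMMRegs below is 2 and %al is set to 2
    // before the call; any value in [2, 8] would also satisfy the ABI, since
    // %al only needs to be an upper bound.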
4210 
4211     // Count the number of XMM registers allocated.
4212     static const MCPhysReg XMMArgRegs[] = {
4213       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4214       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4215     };
4216     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4217     assert((Subtarget.hasSSE1() || !NumXMMRegs)
4218            && "SSE registers cannot be used when SSE is disabled");
4219     RegsToPass.push_back(std::make_pair(Register(X86::AL),
4220                                         DAG.getConstant(NumXMMRegs, dl,
4221                                                         MVT::i8)));
4222   }
4223 
4224   if (isVarArg && IsMustTail) {
4225     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4226     for (const auto &F : Forwards) {
4227       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4228       RegsToPass.push_back(std::make_pair(F.PReg, Val));
4229     }
4230   }
4231 
4232   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
4233   // don't need this because the eligibility check rejects calls that require
4234   // shuffling arguments passed in memory.
4235   if (!IsSibcall && isTailCall) {
4236     // Force all the incoming stack arguments to be loaded from the stack
4237     // before any new outgoing arguments are stored to the stack, because the
4238     // outgoing stack slots may alias the incoming argument stack slots, and
4239     // the alias isn't otherwise explicit. This is slightly more conservative
4240     // than necessary, because it means that each store effectively depends
4241     // on every argument instead of just those arguments it would clobber.
4242     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4243 
4244     SmallVector<SDValue, 8> MemOpChains2;
4245     SDValue FIN;
4246     int FI = 0;
4247     for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4248          ++I, ++OutsIndex) {
4249       CCValAssign &VA = ArgLocs[I];
4250 
4251       if (VA.isRegLoc()) {
4252         if (VA.needsCustom()) {
4253           assert((CallConv == CallingConv::X86_RegCall) &&
4254                  "Expecting custom case only in regcall calling convention");
          // This means we are in the special case where one argument was
          // passed through two register locations; skip the next location.
4257           ++I;
4258         }
4259 
4260         continue;
4261       }
4262 
4263       assert(VA.isMemLoc());
4264       SDValue Arg = OutVals[OutsIndex];
4265       ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4266       // Skip inalloca/preallocated arguments.  They don't require any work.
4267       if (Flags.isInAlloca() || Flags.isPreallocated())
4268         continue;
4269       // Create frame index.
4270       int32_t Offset = VA.getLocMemOffset()+FPDiff;
4271       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4272       FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4273       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4274 
4275       if (Flags.isByVal()) {
4276         // Copy relative to framepointer.
4277         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4278         if (!StackPtr.getNode())
4279           StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4280                                         getPointerTy(DAG.getDataLayout()));
4281         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4282                              StackPtr, Source);
4283 
4284         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4285                                                          ArgChain,
4286                                                          Flags, DAG, dl));
4287       } else {
4288         // Store relative to framepointer.
4289         MemOpChains2.push_back(DAG.getStore(
4290             ArgChain, dl, Arg, FIN,
4291             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4292       }
4293     }
4294 
4295     if (!MemOpChains2.empty())
4296       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4297 
4298     // Store the return address to the appropriate stack slot.
4299     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4300                                      getPointerTy(DAG.getDataLayout()),
4301                                      RegInfo->getSlotSize(), FPDiff, dl);
4302   }
4303 
4304   // Build a sequence of copy-to-reg nodes chained together with token chain
4305   // and flag operands which copy the outgoing args into registers.
4306   SDValue InFlag;
4307   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4308     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4309                              RegsToPass[i].second, InFlag);
4310     InFlag = Chain.getValue(1);
4311   }
4312 
4313   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4314     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4315     // In the 64-bit large code model, we have to make all calls
4316     // through a register, since the call instruction's 32-bit
4317     // pc-relative offset may not be large enough to hold the whole
4318     // address.
4319   } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4320              Callee->getOpcode() == ISD::ExternalSymbol) {
4321     // Lower direct calls to global addresses and external symbols. Setting
4322     // ForCall to true here has the effect of removing WrapperRIP when possible
4323     // to allow direct calls to be selected without first materializing the
4324     // address into a register.
4325     Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4326   } else if (Subtarget.isTarget64BitILP32() &&
4327              Callee->getValueType(0) == MVT::i32) {
    // Zero-extend the 32-bit Callee address to 64 bits, per the x32 ABI.
4329     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4330   }
4331 
4332   // Returns a chain & a flag for retval copy to use.
4333   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4334   SmallVector<SDValue, 8> Ops;
4335 
4336   if (!IsSibcall && isTailCall && !IsMustTail) {
4337     Chain = DAG.getCALLSEQ_END(Chain,
4338                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4339                                DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4340     InFlag = Chain.getValue(1);
4341   }
4342 
4343   Ops.push_back(Chain);
4344   Ops.push_back(Callee);
4345 
4346   if (isTailCall)
4347     Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4348 
4349   // Add argument registers to the end of the list so that they are known live
4350   // into the call.
4351   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4352     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4353                                   RegsToPass[i].second.getValueType()));
4354 
4355   // Add a register mask operand representing the call-preserved registers.
4356   const uint32_t *Mask = [&]() {
4357     auto AdaptedCC = CallConv;
    // If HasNCSR is set (the no_caller_saved_registers attribute is present),
    // use the X86_INTR calling convention because it has the same CSR mask
    // (the same preserved registers).
    if (HasNCSR)
      AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
    // If NoCalleeSavedRegisters is requested, then use GHC since it happens
    // to use the CSR_NoRegs_RegMask.
4365     if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4366       AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4367     return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4368   }();
4369   assert(Mask && "Missing call preserved mask for calling convention");
4370 
4371   // If this is an invoke in a 32-bit function using a funclet-based
4372   // personality, assume the function clobbers all registers. If an exception
4373   // is thrown, the runtime will not restore CSRs.
4374   // FIXME: Model this more precisely so that we can register allocate across
4375   // the normal edge and spill and fill across the exceptional edge.
4376   if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4377     const Function &CallerFn = MF.getFunction();
4378     EHPersonality Pers =
4379         CallerFn.hasPersonalityFn()
4380             ? classifyEHPersonality(CallerFn.getPersonalityFn())
4381             : EHPersonality::Unknown;
4382     if (isFuncletEHPersonality(Pers))
4383       Mask = RegInfo->getNoPreservedMask();
4384   }
4385 
4386   // Define a new register mask from the existing mask.
4387   uint32_t *RegMask = nullptr;
4388 
4389   // In some calling conventions we need to remove the used physical registers
4390   // from the reg mask.
4391   if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4392     const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4393 
4394     // Allocate a new Reg Mask and copy Mask.
4395     RegMask = MF.allocateRegMask();
4396     unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4397     memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4398 
4399     // Make sure all sub registers of the argument registers are reset
4400     // in the RegMask.
4401     for (auto const &RegPair : RegsToPass)
4402       for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4403            SubRegs.isValid(); ++SubRegs)
4404         RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4405 
4406     // Create the RegMask Operand according to our updated mask.
4407     Ops.push_back(DAG.getRegisterMask(RegMask));
4408   } else {
4409     // Create the RegMask Operand according to the static mask.
4410     Ops.push_back(DAG.getRegisterMask(Mask));
4411   }
4412 
4413   if (InFlag.getNode())
4414     Ops.push_back(InFlag);
4415 
4416   if (isTailCall) {
4417     // We used to do:
4418     //// If this is the first return lowered for this function, add the regs
4419     //// to the liveout set for the function.
4420     // This isn't right, although it's probably harmless on x86; liveouts
4421     // should be computed from returns not tail calls.  Consider a void
4422     // function making a tail call to a function returning int.
4423     MF.getFrameInfo().setHasTailCall();
4424     SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4425     DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4426     return Ret;
4427   }
4428 
4429   if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4430     Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4431   } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
    // Calls with a "clang.arc.attachedcall" bundle are special. They should be
    // expanded to the call, directly followed by a special marker sequence and
    // a call to an ObjC library function. Use CALL_RVMARKER to do that.
4435     assert(!isTailCall &&
4436            "tail calls cannot be marked with clang.arc.attachedcall");
4437     assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
4438 
    // Add a target constant to select the ObjC runtime call just before the
    // call target. RuntimeCallType == 0 selects
    // objc_retainAutoreleasedReturnValue, while RuntimeCallType == 1 selects
    // objc_unsafeClaimAutoreleasedReturnValue when expanding the pseudo.
4443     unsigned RuntimeCallType =
4444         objcarc::hasAttachedCallOpBundle(CLI.CB, true) ? 0 : 1;
4445     Ops.insert(Ops.begin() + 1,
4446                DAG.getTargetConstant(RuntimeCallType, dl, MVT::i32));
4447     Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4448   } else {
4449     Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4450   }
4451 
4452   InFlag = Chain.getValue(1);
4453   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4454   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4455 
4456   // Save heapallocsite metadata.
4457   if (CLI.CB)
4458     if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
4459       DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4460 
4461   // Create the CALLSEQ_END node.
4462   unsigned NumBytesForCalleeToPop;
4463   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4464                        DAG.getTarget().Options.GuaranteedTailCallOpt))
4465     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
4466   else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
4467            !Subtarget.getTargetTriple().isOSMSVCRT() &&
4468            SR == StackStructReturn)
4469     // If this is a call to a struct-return function, the callee
4470     // pops the hidden struct pointer, so we have to push it back.
4471     // This is common for Darwin/X86, Linux & Mingw32 targets.
4472     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
4473     NumBytesForCalleeToPop = 4;
4474   else
4475     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
4476 
4477   // Returns a flag for retval copy to use.
4478   if (!IsSibcall) {
4479     Chain = DAG.getCALLSEQ_END(Chain,
4480                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4481                                DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4482                                                      true),
4483                                InFlag, dl);
4484     InFlag = Chain.getValue(1);
4485   }
4486 
4487   // Handle result values, copying them out of physregs into vregs that we
4488   // return.
4489   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4490                          InVals, RegMask);
4491 }
4492 
4493 //===----------------------------------------------------------------------===//
4494 //                Fast Calling Convention (tail call) implementation
4495 //===----------------------------------------------------------------------===//
4496 
//  Like stdcall, this convention has the callee clean up the arguments, except
//  that ECX is reserved for storing the address of the tail-called function.
//  Only 2 registers are free for argument passing (inreg). Tail call
//  optimization is performed provided:
//                * tailcallopt is enabled
//                * caller/callee are fastcc
//  On the X86_64 architecture with GOT-style position independent code, only
//  local (within module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI, the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld
//  for example.)
//  If the tail-called callee has more arguments than the caller, the caller
//  needs to make sure that there is room to move the RETADDR to. This is
//  achieved by reserving an area the size of the argument delta right after
//  the original RETADDR, but before the saved frame pointer or the spilled
//  registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
4513 //  stack layout:
4514 //    arg1
4515 //    arg2
4516 //    RETADDR
4517 //    [ new RETADDR
4518 //      move area ]
4519 //    (possible EBP)
4520 //    ESI
4521 //    EDI
4522 //    local1 ..
4523 
/// Align the stack size so that, together with the return-address slot, it
/// satisfies the stack alignment requirement (e.g. 16n + 12 for a 16-byte
/// alignment on 32-bit, where the slot size is 4).
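/// For example (a sketch), on x86-64 (SlotSize == 8, StackAlignment == 16) a
/// StackSize of 32 becomes alignTo(32 + 8, 16) - 8 == 40, so the argument
/// area plus the pushed return address is 16-byte aligned at the call.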
4526 unsigned
4527 X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4528                                                SelectionDAG &DAG) const {
4529   const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
4530   const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4531   assert(StackSize % SlotSize == 0 &&
4532          "StackSize must be a multiple of SlotSize");
4533   return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4534 }
4535 
4536 /// Return true if the given stack call argument is already available in the
4537 /// same position (relatively) of the caller's incoming argument stack.
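/// For example (a sketch): in
///   int g(int a, int b);
///   int f(int a, int b) { return g(a, b); }
/// a 32-bit sibcall from f to g can reuse the incoming stack slots of a and b
/// unchanged; this predicate is what detects that no copy is needed.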
4538 static
4539 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4540                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4541                          const X86InstrInfo *TII, const CCValAssign &VA) {
4542   unsigned Bytes = Arg.getValueSizeInBits() / 8;
4543 
4544   for (;;) {
4545     // Look through nodes that don't alter the bits of the incoming value.
4546     unsigned Op = Arg.getOpcode();
4547     if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4548       Arg = Arg.getOperand(0);
4549       continue;
4550     }
4551     if (Op == ISD::TRUNCATE) {
4552       const SDValue &TruncInput = Arg.getOperand(0);
4553       if (TruncInput.getOpcode() == ISD::AssertZext &&
4554           cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4555               Arg.getValueType()) {
4556         Arg = TruncInput.getOperand(0);
4557         continue;
4558       }
4559     }
4560     break;
4561   }
4562 
4563   int FI = INT_MAX;
4564   if (Arg.getOpcode() == ISD::CopyFromReg) {
4565     Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4566     if (!VR.isVirtual())
4567       return false;
4568     MachineInstr *Def = MRI->getVRegDef(VR);
4569     if (!Def)
4570       return false;
4571     if (!Flags.isByVal()) {
4572       if (!TII->isLoadFromStackSlot(*Def, FI))
4573         return false;
4574     } else {
4575       unsigned Opcode = Def->getOpcode();
4576       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4577            Opcode == X86::LEA64_32r) &&
4578           Def->getOperand(1).isFI()) {
4579         FI = Def->getOperand(1).getIndex();
4580         Bytes = Flags.getByValSize();
4581       } else
4582         return false;
4583     }
4584   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4585     if (Flags.isByVal())
4586       // ByVal argument is passed in as a pointer but it's now being
4587       // dereferenced. e.g.
4588       // define @foo(%struct.X* %A) {
4589       //   tail call @bar(%struct.X* byval %A)
4590       // }
4591       return false;
4592     SDValue Ptr = Ld->getBasePtr();
4593     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4594     if (!FINode)
4595       return false;
4596     FI = FINode->getIndex();
4597   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4598     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4599     FI = FINode->getIndex();
4600     Bytes = Flags.getByValSize();
4601   } else
4602     return false;
4603 
4604   assert(FI != INT_MAX);
4605   if (!MFI.isFixedObjectIndex(FI))
4606     return false;
4607 
4608   if (Offset != MFI.getObjectOffset(FI))
4609     return false;
4610 
4611   // If this is not byval, check that the argument stack object is immutable.
4612   // inalloca and argument copy elision can create mutable argument stack
4613   // objects. Byval objects can be mutated, but a byval call intends to pass the
4614   // mutated memory.
4615   if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4616     return false;
4617 
4618   if (VA.getLocVT().getFixedSizeInBits() >
4619       Arg.getValueSizeInBits().getFixedSize()) {
4620     // If the argument location is wider than the argument type, check that any
4621     // extension flags match.
4622     if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4623         Flags.isSExt() != MFI.isObjectSExt(FI)) {
4624       return false;
4625     }
4626   }
4627 
4628   return Bytes == MFI.getObjectSize(FI);
4629 }
4630 
4631 /// Check whether the call is eligible for tail call optimization. Targets
4632 /// that want to do tail call optimization should implement this function.
4633 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4634     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4635     bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4636     const SmallVectorImpl<ISD::OutputArg> &Outs,
4637     const SmallVectorImpl<SDValue> &OutVals,
4638     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4639   if (!mayTailCallThisCC(CalleeCC))
4640     return false;
4641 
4642   // If -tailcallopt is specified, make fastcc functions tail-callable.
4643   MachineFunction &MF = DAG.getMachineFunction();
4644   const Function &CallerF = MF.getFunction();
4645 
4646   // If the function return type is x86_fp80 and the callee return type is not,
4647   // then the FP_EXTEND of the call result is not a nop. It's not safe to
4648   // perform a tailcall optimization here.
4649   if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4650     return false;
4651 
4652   CallingConv::ID CallerCC = CallerF.getCallingConv();
4653   bool CCMatch = CallerCC == CalleeCC;
4654   bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4655   bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4656   bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4657       CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
4658 
4659   // Win64 functions have extra shadow space for argument homing. Don't do the
4660   // sibcall if the caller and callee have mismatched expectations for this
4661   // space.
4662   if (IsCalleeWin64 != IsCallerWin64)
4663     return false;
4664 
4665   if (IsGuaranteeTCO) {
4666     if (canGuaranteeTCO(CalleeCC) && CCMatch)
4667       return true;
4668     return false;
4669   }
4670 
4671   // Look for obvious safe cases to perform tail call optimization that do not
4672   // require ABI changes. This is what gcc calls sibcall.
4673 
4674   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4675   // emit a special epilogue.
4676   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4677   if (RegInfo->hasStackRealignment(MF))
4678     return false;
4679 
4680   // Also avoid sibcall optimization if either caller or callee uses struct
4681   // return semantics.
4682   if (isCalleeStructRet || isCallerStructRet)
4683     return false;
4684 
4685   // Do not sibcall optimize vararg calls unless all arguments are passed via
4686   // registers.
4687   LLVMContext &C = *DAG.getContext();
4688   if (isVarArg && !Outs.empty()) {
4689     // Optimizing for varargs on Win64 is unlikely to be safe without
4690     // additional testing.
4691     if (IsCalleeWin64 || IsCallerWin64)
4692       return false;
4693 
4694     SmallVector<CCValAssign, 16> ArgLocs;
4695     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4696 
4697     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4698     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4699       if (!ArgLocs[i].isRegLoc())
4700         return false;
4701   }
4702 
4703   // If the call result is in ST0 / ST1, it needs to be popped off the x87
4704   // stack.  Therefore, if it's not used by the caller it is not safe to optimize
4705   // this into a sibcall.
4706   bool Unused = false;
4707   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4708     if (!Ins[i].Used) {
4709       Unused = true;
4710       break;
4711     }
4712   }
4713   if (Unused) {
4714     SmallVector<CCValAssign, 16> RVLocs;
4715     CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4716     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4717     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4718       CCValAssign &VA = RVLocs[i];
4719       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4720         return false;
4721     }
4722   }
4723 
4724   // Check that the call results are passed in the same way.
4725   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4726                                   RetCC_X86, RetCC_X86))
4727     return false;
4728   // The callee has to preserve all registers the caller needs to preserve.
4729   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4730   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4731   if (!CCMatch) {
4732     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4733     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4734       return false;
4735   }
4736 
4737   unsigned StackArgsSize = 0;
4738 
4739   // If the callee takes no arguments then go on to check the results of the
4740   // call.
4741   if (!Outs.empty()) {
4742     // Check if stack adjustment is needed. For now, do not do this if any
4743     // argument is passed on the stack.
4744     SmallVector<CCValAssign, 16> ArgLocs;
4745     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4746 
4747     // Allocate shadow area for Win64
4748     if (IsCalleeWin64)
4749       CCInfo.AllocateStack(32, Align(8));
4750 
4751     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4752     StackArgsSize = CCInfo.getNextStackOffset();
4753 
4754     if (CCInfo.getNextStackOffset()) {
4755       // Check if the arguments are already laid out in the same way as
4756       // the caller's fixed stack objects.
4757       MachineFrameInfo &MFI = MF.getFrameInfo();
4758       const MachineRegisterInfo *MRI = &MF.getRegInfo();
4759       const X86InstrInfo *TII = Subtarget.getInstrInfo();
4760       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4761         CCValAssign &VA = ArgLocs[i];
4762         SDValue Arg = OutVals[i];
4763         ISD::ArgFlagsTy Flags = Outs[i].Flags;
4764         if (VA.getLocInfo() == CCValAssign::Indirect)
4765           return false;
4766         if (!VA.isRegLoc()) {
4767           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4768                                    MFI, MRI, TII, VA))
4769             return false;
4770         }
4771       }
4772     }
4773 
4774     bool PositionIndependent = isPositionIndependent();
4775     // If the tailcall address may be in a register, then make sure it's
4776     // possible to register allocate for it. In 32-bit, the call address can
4777     // only target EAX, EDX, or ECX since the tail call must be scheduled after
4778     // callee-saved registers are restored. These happen to be the same
4779     // registers used to pass 'inreg' arguments so watch out for those.
4780     if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4781                                   !isa<ExternalSymbolSDNode>(Callee)) ||
4782                                  PositionIndependent)) {
4783       unsigned NumInRegs = 0;
4784       // In PIC we need an extra register to formulate the address computation
4785       // for the callee.
4786       unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4787 
4788       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4789         CCValAssign &VA = ArgLocs[i];
4790         if (!VA.isRegLoc())
4791           continue;
4792         Register Reg = VA.getLocReg();
4793         switch (Reg) {
4794         default: break;
4795         case X86::EAX: case X86::EDX: case X86::ECX:
4796           if (++NumInRegs == MaxInRegs)
4797             return false;
4798           break;
4799         }
4800       }
4801     }
4802 
4803     const MachineRegisterInfo &MRI = MF.getRegInfo();
4804     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4805       return false;
4806   }
4807 
4808   bool CalleeWillPop =
4809       X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4810                        MF.getTarget().Options.GuaranteedTailCallOpt);
4811 
4812   if (unsigned BytesToPop =
4813           MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4814     // If we have bytes to pop, the callee must pop them.
4815     bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4816     if (!CalleePopMatches)
4817       return false;
4818   } else if (CalleeWillPop && StackArgsSize > 0) {
4819     // If we don't have bytes to pop, make sure the callee doesn't pop any.
4820     return false;
4821   }
4822 
4823   return true;
4824 }
4825 
4826 FastISel *
4827 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4828                                   const TargetLibraryInfo *libInfo) const {
4829   return X86::createFastISel(funcInfo, libInfo);
4830 }
4831 
4832 //===----------------------------------------------------------------------===//
4833 //                           Other Lowering Hooks
4834 //===----------------------------------------------------------------------===//
4835 
4836 static bool MayFoldLoad(SDValue Op) {
4837   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4838 }
4839 
4840 static bool MayFoldIntoStore(SDValue Op) {
4841   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4842 }
4843 
4844 static bool MayFoldIntoZeroExtend(SDValue Op) {
4845   if (Op.hasOneUse()) {
4846     unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4847     return (ISD::ZERO_EXTEND == Opcode);
4848   }
4849   return false;
4850 }
4851 
4852 static bool isTargetShuffle(unsigned Opcode) {
4853   switch(Opcode) {
4854   default: return false;
4855   case X86ISD::BLENDI:
4856   case X86ISD::PSHUFB:
4857   case X86ISD::PSHUFD:
4858   case X86ISD::PSHUFHW:
4859   case X86ISD::PSHUFLW:
4860   case X86ISD::SHUFP:
4861   case X86ISD::INSERTPS:
4862   case X86ISD::EXTRQI:
4863   case X86ISD::INSERTQI:
4864   case X86ISD::VALIGN:
4865   case X86ISD::PALIGNR:
4866   case X86ISD::VSHLDQ:
4867   case X86ISD::VSRLDQ:
4868   case X86ISD::MOVLHPS:
4869   case X86ISD::MOVHLPS:
4870   case X86ISD::MOVSHDUP:
4871   case X86ISD::MOVSLDUP:
4872   case X86ISD::MOVDDUP:
4873   case X86ISD::MOVSS:
4874   case X86ISD::MOVSD:
4875   case X86ISD::UNPCKL:
4876   case X86ISD::UNPCKH:
4877   case X86ISD::VBROADCAST:
4878   case X86ISD::VPERMILPI:
4879   case X86ISD::VPERMILPV:
4880   case X86ISD::VPERM2X128:
4881   case X86ISD::SHUF128:
4882   case X86ISD::VPERMIL2:
4883   case X86ISD::VPERMI:
4884   case X86ISD::VPPERM:
4885   case X86ISD::VPERMV:
4886   case X86ISD::VPERMV3:
4887   case X86ISD::VZEXT_MOVL:
4888     return true;
4889   }
4890 }
4891 
4892 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4893   switch (Opcode) {
4894   default: return false;
4895   // Target Shuffles.
4896   case X86ISD::PSHUFB:
4897   case X86ISD::VPERMILPV:
4898   case X86ISD::VPERMIL2:
4899   case X86ISD::VPPERM:
4900   case X86ISD::VPERMV:
4901   case X86ISD::VPERMV3:
4902     return true;
4903   // 'Faux' Target Shuffles.
4904   case ISD::OR:
4905   case ISD::AND:
4906   case X86ISD::ANDNP:
4907     return true;
4908   }
4909 }
4910 
4911 static bool isTargetShuffleSplat(SDValue Op) {
4912   unsigned Opcode = Op.getOpcode();
4913   if (Opcode == ISD::EXTRACT_SUBVECTOR)
4914     return isTargetShuffleSplat(Op.getOperand(0));
4915   return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
4916 }
4917 
4918 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4919   MachineFunction &MF = DAG.getMachineFunction();
4920   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4921   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4922   int ReturnAddrIndex = FuncInfo->getRAIndex();
4923 
4924   if (ReturnAddrIndex == 0) {
4925     // Set up a frame object for the return address.
4926     unsigned SlotSize = RegInfo->getSlotSize();
4927     ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4928                                                           -(int64_t)SlotSize,
4929                                                           false);
4930     FuncInfo->setRAIndex(ReturnAddrIndex);
4931   }
4932 
4933   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4934 }
4935 
4936 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4937                                        bool hasSymbolicDisplacement) {
4938   // Offset should fit into a 32-bit immediate field.
4939   if (!isInt<32>(Offset))
4940     return false;
4941 
4942   // If we don't have a symbolic displacement - we don't have any extra
4943   // restrictions.
4944   if (!hasSymbolicDisplacement)
4945     return true;
4946 
4947   // FIXME: Some tweaks might be needed for medium code model.
4948   if (M != CodeModel::Small && M != CodeModel::Kernel)
4949     return false;
4950 
4951   // For the small code model we assume that the last object lies at least
4952   // 16MB below the 2^31 boundary. We may also accept pretty large negative
4953   // constants, as all objects are in the positive half of the address space.
4954   if (M == CodeModel::Small && Offset < 16*1024*1024)
4955     return true;
4956 
4957   // For the kernel code model we know all objects reside in the negative
4958   // half of the 32-bit address space. We must not accept negative offsets,
4959   // as they may fall below it, but we may accept pretty large positive ones.
4960   if (M == CodeModel::Kernel && Offset >= 0)
4961     return true;
4962 
4963   return false;
4964 }
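
// For illustration: with a symbolic displacement under the small code model,
// an offset of 0x00F00000 (15MB) is acceptable while 0x7FF00000 is not; under
// the kernel code model any non-negative offset that fits in 32 bits is
// accepted.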
4965 
4966 /// Determines whether the callee is required to pop its own arguments.
4967 /// Callee pop is necessary to support tail calls.
4968 bool X86::isCalleePop(CallingConv::ID CallingConv,
4969                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4970   // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4971   // can guarantee TCO.
4972   if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4973     return true;
4974 
4975   switch (CallingConv) {
4976   default:
4977     return false;
4978   case CallingConv::X86_StdCall:
4979   case CallingConv::X86_FastCall:
4980   case CallingConv::X86_ThisCall:
4981   case CallingConv::X86_VectorCall:
4982     return !is64Bit;
4983   }
4984 }
4985 
4986 /// Return true if the condition is a signed comparison operation.
4987 static bool isX86CCSigned(unsigned X86CC) {
4988   switch (X86CC) {
4989   default:
4990     llvm_unreachable("Invalid integer condition!");
4991   case X86::COND_E:
4992   case X86::COND_NE:
4993   case X86::COND_B:
4994   case X86::COND_A:
4995   case X86::COND_BE:
4996   case X86::COND_AE:
4997     return false;
4998   case X86::COND_G:
4999   case X86::COND_GE:
5000   case X86::COND_L:
5001   case X86::COND_LE:
5002     return true;
5003   }
5004 }
5005 
5006 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5007   switch (SetCCOpcode) {
5008   default: llvm_unreachable("Invalid integer condition!");
5009   case ISD::SETEQ:  return X86::COND_E;
5010   case ISD::SETGT:  return X86::COND_G;
5011   case ISD::SETGE:  return X86::COND_GE;
5012   case ISD::SETLT:  return X86::COND_L;
5013   case ISD::SETLE:  return X86::COND_LE;
5014   case ISD::SETNE:  return X86::COND_NE;
5015   case ISD::SETULT: return X86::COND_B;
5016   case ISD::SETUGT: return X86::COND_A;
5017   case ISD::SETULE: return X86::COND_BE;
5018   case ISD::SETUGE: return X86::COND_AE;
5019   }
5020 }
5021 
5022 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
5023 /// condition code, returning the condition code and the LHS/RHS of the
5024 /// comparison to make.
5025 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5026                                bool isFP, SDValue &LHS, SDValue &RHS,
5027                                SelectionDAG &DAG) {
5028   if (!isFP) {
5029     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5030       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
5031         // X > -1  -> compare X against 0, jump if the sign flag is clear.
5032         RHS = DAG.getConstant(0, DL, RHS.getValueType());
5033         return X86::COND_NS;
5034       }
5035       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
5036         // X < 0   -> compare X against 0, jump if the sign flag is set.
5037         return X86::COND_S;
5038       }
5039       if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
5040         // X >= 0  -> compare X against 0, jump if the sign flag is clear.
5041         return X86::COND_NS;
5042       }
5043       if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5044         // X < 1   -> X <= 0
5045         RHS = DAG.getConstant(0, DL, RHS.getValueType());
5046         return X86::COND_LE;
5047       }
5048     }
5049 
5050     return TranslateIntegerX86CC(SetCCOpcode);
5051   }
5052 
5053   // First determine if it is required or is profitable to flip the operands.
5054 
5055   // If LHS is a foldable load, but RHS is not, flip the condition.
5056   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5057       !ISD::isNON_EXTLoad(RHS.getNode())) {
5058     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5059     std::swap(LHS, RHS);
5060   }
5061 
5062   switch (SetCCOpcode) {
5063   default: break;
5064   case ISD::SETOLT:
5065   case ISD::SETOLE:
5066   case ISD::SETUGT:
5067   case ISD::SETUGE:
5068     std::swap(LHS, RHS);
5069     break;
5070   }
5071 
5072   // On a floating point condition, the flags are set as follows:
5073   // ZF  PF  CF   op
5074   //  0 | 0 | 0 | X > Y
5075   //  0 | 0 | 1 | X < Y
5076   //  1 | 0 | 0 | X == Y
5077   //  1 | 1 | 1 | unordered
5078   switch (SetCCOpcode) {
5079   default: llvm_unreachable("Condcode should be pre-legalized away");
5080   case ISD::SETUEQ:
5081   case ISD::SETEQ:   return X86::COND_E;
5082   case ISD::SETOLT:              // flipped
5083   case ISD::SETOGT:
5084   case ISD::SETGT:   return X86::COND_A;
5085   case ISD::SETOLE:              // flipped
5086   case ISD::SETOGE:
5087   case ISD::SETGE:   return X86::COND_AE;
5088   case ISD::SETUGT:              // flipped
5089   case ISD::SETULT:
5090   case ISD::SETLT:   return X86::COND_B;
5091   case ISD::SETUGE:              // flipped
5092   case ISD::SETULE:
5093   case ISD::SETLE:   return X86::COND_BE;
5094   case ISD::SETONE:
5095   case ISD::SETNE:   return X86::COND_NE;
5096   case ISD::SETUO:   return X86::COND_P;
5097   case ISD::SETO:    return X86::COND_NP;
5098   case ISD::SETOEQ:
5099   case ISD::SETUNE:  return X86::COND_INVALID;
5100   }
5101 }
5102 
5103 /// Is there a floating point cmov for the specific X86 condition code?
5104 /// The current x86 ISA includes the following FP cmov instructions:
5105 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5106 static bool hasFPCMov(unsigned X86CC) {
5107   switch (X86CC) {
5108   default:
5109     return false;
5110   case X86::COND_B:
5111   case X86::COND_BE:
5112   case X86::COND_E:
5113   case X86::COND_P:
5114   case X86::COND_A:
5115   case X86::COND_AE:
5116   case X86::COND_NE:
5117   case X86::COND_NP:
5118     return true;
5119   }
5120 }
5121 
5122 
5123 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5124                                            const CallInst &I,
5125                                            MachineFunction &MF,
5126                                            unsigned Intrinsic) const {
5127   Info.flags = MachineMemOperand::MONone;
5128   Info.offset = 0;
5129 
5130   const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5131   if (!IntrData) {
5132     switch (Intrinsic) {
5133     case Intrinsic::x86_aesenc128kl:
5134     case Intrinsic::x86_aesdec128kl:
5135       Info.opc = ISD::INTRINSIC_W_CHAIN;
5136       Info.ptrVal = I.getArgOperand(1);
5137       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5138       Info.align = Align(1);
5139       Info.flags |= MachineMemOperand::MOLoad;
5140       return true;
5141     case Intrinsic::x86_aesenc256kl:
5142     case Intrinsic::x86_aesdec256kl:
5143       Info.opc = ISD::INTRINSIC_W_CHAIN;
5144       Info.ptrVal = I.getArgOperand(1);
5145       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5146       Info.align = Align(1);
5147       Info.flags |= MachineMemOperand::MOLoad;
5148       return true;
5149     case Intrinsic::x86_aesencwide128kl:
5150     case Intrinsic::x86_aesdecwide128kl:
5151       Info.opc = ISD::INTRINSIC_W_CHAIN;
5152       Info.ptrVal = I.getArgOperand(0);
5153       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5154       Info.align = Align(1);
5155       Info.flags |= MachineMemOperand::MOLoad;
5156       return true;
5157     case Intrinsic::x86_aesencwide256kl:
5158     case Intrinsic::x86_aesdecwide256kl:
5159       Info.opc = ISD::INTRINSIC_W_CHAIN;
5160       Info.ptrVal = I.getArgOperand(0);
5161       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5162       Info.align = Align(1);
5163       Info.flags |= MachineMemOperand::MOLoad;
5164       return true;
5165     }
5166     return false;
5167   }
5168 
5169   switch (IntrData->Type) {
5170   case TRUNCATE_TO_MEM_VI8:
5171   case TRUNCATE_TO_MEM_VI16:
5172   case TRUNCATE_TO_MEM_VI32: {
5173     Info.opc = ISD::INTRINSIC_VOID;
5174     Info.ptrVal = I.getArgOperand(0);
5175     MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
5176     MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5177     if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5178       ScalarVT = MVT::i8;
5179     else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5180       ScalarVT = MVT::i16;
5181     else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5182       ScalarVT = MVT::i32;
5183 
5184     Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5185     Info.align = Align(1);
5186     Info.flags |= MachineMemOperand::MOStore;
5187     break;
5188   }
5189   case GATHER:
5190   case GATHER_AVX2: {
5191     Info.opc = ISD::INTRINSIC_W_CHAIN;
5192     Info.ptrVal = nullptr;
5193     MVT DataVT = MVT::getVT(I.getType());
5194     MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5195     unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5196                                 IndexVT.getVectorNumElements());
5197     Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5198     Info.align = Align(1);
5199     Info.flags |= MachineMemOperand::MOLoad;
5200     break;
5201   }
5202   case SCATTER: {
5203     Info.opc = ISD::INTRINSIC_VOID;
5204     Info.ptrVal = nullptr;
5205     MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5206     MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5207     unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5208                                 IndexVT.getVectorNumElements());
5209     Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5210     Info.align = Align(1);
5211     Info.flags |= MachineMemOperand::MOStore;
5212     break;
5213   }
5214   default:
5215     return false;
5216   }
5217 
5218   return true;
5219 }
5220 
5221 /// Returns true if the target can instruction select the
5222 /// specified FP immediate natively. If false, the legalizer will
5223 /// materialize the FP immediate as a load from a constant pool.
5224 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5225                                      bool ForCodeSize) const {
5226   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
5227     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
5228       return true;
5229   }
5230   return false;
5231 }
5232 
5233 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5234                                               ISD::LoadExtType ExtTy,
5235                                               EVT NewVT) const {
5236   assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5237 
5238   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5239   // relocations target a movq or addq instruction: don't let the load shrink.
5240   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5241   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5242     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5243       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5244 
5245   // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5246   // those uses are extracted directly into a store, then the extract + store
5247   // can be store-folded. Therefore, it's probably not worth splitting the load.
5248   EVT VT = Load->getValueType(0);
5249   if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5250     for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5251       // Skip uses of the chain value. Result 0 of the node is the load value.
5252       if (UI.getUse().getResNo() != 0)
5253         continue;
5254 
5255       // If this use is not an extract + store, it's probably worth splitting.
5256       if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5257           UI->use_begin()->getOpcode() != ISD::STORE)
5258         return true;
5259     }
5260     // All non-chain uses are extract + store.
5261     return false;
5262   }
5263 
5264   return true;
5265 }
5266 
5267 /// Returns true if it is beneficial to convert a load of a constant
5268 /// to just the constant itself.
5269 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5270                                                           Type *Ty) const {
5271   assert(Ty->isIntegerTy());
5272 
5273   unsigned BitSize = Ty->getPrimitiveSizeInBits();
5274   if (BitSize == 0 || BitSize > 64)
5275     return false;
5276   return true;
5277 }
5278 
5279 bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5280   // If we are using XMM registers in the ABI and the condition of the select is
5281   // a floating-point compare and we have blendv or conditional move, then it is
5282   // cheaper to select instead of doing a cross-register move and creating a
5283   // load that depends on the compare result.
5284   bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5285   return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5286 }
5287 
5288 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5289   // TODO: It might be a win to ease or lift this restriction, but the generic
5290   // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5291   if (VT.isVector() && Subtarget.hasAVX512())
5292     return false;
5293 
5294   return true;
5295 }
5296 
5297 bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5298                                                SDValue C) const {
5299   // TODO: We handle scalars using custom code, but generic combining could make
5300   // that unnecessary.
5301   APInt MulC;
5302   if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5303     return false;
5304 
5305   // Find the type this will be legalized to. Otherwise we might prematurely
5306   // convert this to shl+add/sub and then still have to type legalize those ops.
5307   // Another choice would be to defer the decision for illegal types until
5308   // after type legalization. But constant splat vectors of i64 can't make it
5309   // through type legalization on 32-bit targets so we would need to special
5310   // case vXi64.
5311   while (getTypeAction(Context, VT) != TypeLegal)
5312     VT = getTypeToTransformTo(Context, VT);
5313 
5314   // If vector multiply is legal, assume that's faster than shl + add/sub.
5315   // TODO: Multiply is a complex op with higher latency and lower throughput in
5316   //       most implementations, so this check could be loosened based on type
5317   //       and/or a CPU attribute.
5318   if (isOperationLegal(ISD::MUL, VT))
5319     return false;
5320 
5321   // shl+add, shl+sub, shl+add+neg
5322   return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5323          (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5324 }
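
// For illustration: a constant splat multiply by 17 is decomposed when the
// vector multiply is not legal, since 17 - 1 is a power of two and the
// product can be formed as (X << 4) + X; a splat multiply by 22 is rejected,
// since none of 21, 23, -21 or -23 is a power of two.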
5325 
5326 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5327                                                 unsigned Index) const {
5328   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5329     return false;
5330 
5331   // Mask vectors support all subregister combinations and operations that
5332   // extract half of a vector.
5333   if (ResVT.getVectorElementType() == MVT::i1)
5334     return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5335                           (Index == ResVT.getVectorNumElements()));
5336 
5337   return (Index % ResVT.getVectorNumElements()) == 0;
5338 }
5339 
5340 bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5341   unsigned Opc = VecOp.getOpcode();
5342 
5343   // Assume target opcodes can't be scalarized.
5344   // TODO - do we have any exceptions?
5345   if (Opc >= ISD::BUILTIN_OP_END)
5346     return false;
5347 
5348   // If the vector op is not supported, try to convert to scalar.
5349   EVT VecVT = VecOp.getValueType();
5350   if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5351     return true;
5352 
5353   // If the vector op is supported, but the scalar op is not, the transform may
5354   // not be worthwhile.
5355   EVT ScalarVT = VecVT.getScalarType();
5356   return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5357 }
5358 
5359 bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5360                                              bool) const {
5361   // TODO: Allow vectors?
5362   if (VT.isVector())
5363     return false;
5364   return VT.isSimple() || !isOperationExpand(Opcode, VT);
5365 }
5366 
5367 bool X86TargetLowering::isCheapToSpeculateCttz() const {
5368   // Speculate cttz only if we can directly use TZCNT.
5369   return Subtarget.hasBMI();
5370 }
5371 
5372 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5373   // Speculate ctlz only if we can directly use LZCNT.
5374   return Subtarget.hasLZCNT();
5375 }
5376 
5377 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5378                                                 const SelectionDAG &DAG,
5379                                                 const MachineMemOperand &MMO) const {
5380   if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5381       BitcastVT.getVectorElementType() == MVT::i1)
5382     return false;
5383 
5384   if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5385     return false;
5386 
5387   // If both types are legal vectors, it's always ok to convert them.
5388   if (LoadVT.isVector() && BitcastVT.isVector() &&
5389       isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5390     return true;
5391 
5392   return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5393 }
5394 
5395 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5396                                          const SelectionDAG &DAG) const {
5397   // Do not merge to float value size (128 bits) if no implicit
5398   // float attribute is set.
5399   bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
5400       Attribute::NoImplicitFloat);
5401 
5402   if (NoFloat) {
5403     unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5404     return (MemVT.getSizeInBits() <= MaxIntSize);
5405   }
5406   // Make sure we don't merge greater than our preferred vector
5407   // width.
5408   if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5409     return false;
5410 
5411   return true;
5412 }
5413 
5414 bool X86TargetLowering::isCtlzFast() const {
5415   return Subtarget.hasFastLZCNT();
5416 }
5417 
5418 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5419     const Instruction &AndI) const {
5420   return true;
5421 }
5422 
5423 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5424   EVT VT = Y.getValueType();
5425 
5426   if (VT.isVector())
5427     return false;
5428 
5429   if (!Subtarget.hasBMI())
5430     return false;
5431 
5432   // There are only 32-bit and 64-bit forms for 'andn'.
5433   if (VT != MVT::i32 && VT != MVT::i64)
5434     return false;
5435 
5436   return !isa<ConstantSDNode>(Y);
5437 }
5438 
5439 bool X86TargetLowering::hasAndNot(SDValue Y) const {
5440   EVT VT = Y.getValueType();
5441 
5442   if (!VT.isVector())
5443     return hasAndNotCompare(Y);
5444 
5445   // Vector.
5446 
5447   if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5448     return false;
5449 
5450   if (VT == MVT::v4i32)
5451     return true;
5452 
5453   return Subtarget.hasSSE2();
5454 }
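
// For illustration: hasAndNot returns true for v4i32 with just SSE1 (ANDNPS),
// for other 128-bit-or-wider vectors with SSE2 (PANDN), and for scalar
// i32/i64 only when BMI provides ANDN and the operand is not a constant.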
5455 
5456 bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5457   return X.getValueType().isScalarInteger(); // 'bt'
5458 }
5459 
5460 bool X86TargetLowering::
5461     shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5462         SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5463         unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5464         SelectionDAG &DAG) const {
5465   // Does the baseline recommend not performing the fold by default?
5466   if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5467           X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5468     return false;
5469   // For scalars this transform is always beneficial.
5470   if (X.getValueType().isScalarInteger())
5471     return true;
5472   // If all the shift amounts are identical, then transform is beneficial even
5473   // with rudimentary SSE2 shifts.
5474   if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5475     return true;
5476   // If we have AVX2 with its powerful shift operations, then it's also good.
5477   if (Subtarget.hasAVX2())
5478     return true;
5479   // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
5480   return NewShiftOpcode == ISD::SHL;
5481 }
5482 
5483 bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5484     const SDNode *N, CombineLevel Level) const {
5485   assert(((N->getOpcode() == ISD::SHL &&
5486            N->getOperand(0).getOpcode() == ISD::SRL) ||
5487           (N->getOpcode() == ISD::SRL &&
5488            N->getOperand(0).getOpcode() == ISD::SHL)) &&
5489          "Expected shift-shift mask");
5490   EVT VT = N->getValueType(0);
5491   if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5492       (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5493     // Only fold if the shift values are equal - so it folds to AND.
5494     // TODO - we should fold if either is a non-uniform vector but we don't do
5495     // the fold for non-splats yet.
5496     return N->getOperand(1) == N->getOperand(0).getOperand(1);
5497   }
5498   return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5499 }
5500 
5501 bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5502   EVT VT = Y.getValueType();
5503 
5504   // For vectors, we don't have a preference, but we probably want a mask.
5505   if (VT.isVector())
5506     return false;
5507 
5508   // 64-bit shifts on 32-bit targets produce really bad bloated code.
5509   if (VT == MVT::i64 && !Subtarget.is64Bit())
5510     return false;
5511 
5512   return true;
5513 }
5514 
5515 bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5516                                           SDNode *N) const {
5517   if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5518       !Subtarget.isOSWindows())
5519     return false;
5520   return true;
5521 }
5522 
5523 bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5524   // Any legal vector type can be splatted more efficiently than
5525   // loading/spilling from memory.
5526   return isTypeLegal(VT);
5527 }
5528 
5529 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5530   MVT VT = MVT::getIntegerVT(NumBits);
5531   if (isTypeLegal(VT))
5532     return VT;
5533 
5534   // PMOVMSKB can handle this.
5535   if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5536     return MVT::v16i8;
5537 
5538   // VPMOVMSKB can handle this.
5539   if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5540     return MVT::v32i8;
5541 
5542   // TODO: Allow 64-bit type for 32-bit target.
5543   // TODO: 512-bit types should be allowed, but make sure that those
5544   // cases are handled in combineVectorSizedSetCCEquality().
5545 
5546   return MVT::INVALID_SIMPLE_VALUE_TYPE;
5547 }
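
// For illustration: reporting v32i8 for NumBits == 256 on an AVX2 target lets
// the vector-sized equality combine lower the compare with VPCMPEQB followed
// by VPMOVMSKB rather than a chain of scalar compares.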
5548 
5549 /// Val is the undef sentinel value or equal to the specified value.
5550 static bool isUndefOrEqual(int Val, int CmpVal) {
5551   return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5552 }
5553 
5554 /// Return true if every element in Mask is the undef sentinel value or equal to
5555 /// the specified value.
5556 static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
5557   return llvm::all_of(Mask, [CmpVal](int M) {
5558     return (M == SM_SentinelUndef) || (M == CmpVal);
5559   });
5560 }
5561 
5562 /// Val is either the undef or zero sentinel value.
5563 static bool isUndefOrZero(int Val) {
5564   return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5565 }
5566 
5567 /// Return true if every element in Mask, beginning from position Pos and ending
5568 /// in Pos+Size is the undef sentinel value.
5569 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5570   return llvm::all_of(Mask.slice(Pos, Size),
5571                       [](int M) { return M == SM_SentinelUndef; });
5572 }
5573 
5574 /// Return true if the mask creates a vector whose lower half is undefined.
5575 static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5576   unsigned NumElts = Mask.size();
5577   return isUndefInRange(Mask, 0, NumElts / 2);
5578 }
5579 
5580 /// Return true if the mask creates a vector whose upper half is undefined.
5581 static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5582   unsigned NumElts = Mask.size();
5583   return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5584 }
5585 
5586 /// Return true if Val falls within the specified range [Low, Hi).
5587 static bool isInRange(int Val, int Low, int Hi) {
5588   return (Val >= Low && Val < Hi);
5589 }
5590 
5591 /// Return true if the value of any element in Mask falls within the specified
5592 /// range [Low, Hi).
5593 static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5594   return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5595 }
5596 
5597 /// Return true if the value of any element in Mask is the zero sentinel value.
5598 static bool isAnyZero(ArrayRef<int> Mask) {
5599   return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
5600 }
5601 
5602 /// Return true if the value of any element in Mask is the zero or undef
5603 /// sentinel value.
5604 static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
5605   return llvm::any_of(Mask, [](int M) {
5606     return M == SM_SentinelZero || M == SM_SentinelUndef;
5607   });
5608 }
5609 
5610 /// Return true if Val is undef or if its value falls within the
5611 /// specified range [Low, Hi).
5612 static bool isUndefOrInRange(int Val, int Low, int Hi) {
5613   return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5614 }
5615 
5616 /// Return true if every element in Mask is undef or if its value
5617 /// falls within the specified range [Low, Hi).
5618 static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5619   return llvm::all_of(
5620       Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5621 }
5622 
5623 /// Return true if Val is undef, zero or if its value falls within the
5624 /// specified range [Low, Hi).
5625 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5626   return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5627 }
5628 
5629 /// Return true if every element in Mask is undef, zero or if its value
5630 /// falls within the specified range [Low, Hi).
5631 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5632   return llvm::all_of(
5633       Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5634 }
5635 
5636 /// Return true if every element in Mask, beginning
5637 /// from position Pos and ending in Pos + Size, falls within the specified
5638 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
5639 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5640                                        unsigned Size, int Low, int Step = 1) {
5641   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5642     if (!isUndefOrEqual(Mask[i], Low))
5643       return false;
5644   return true;
5645 }
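
// For illustration: a 4-element slice <4, 5, -1, 7> satisfies
// isSequentialOrUndefInRange with Low = 4 and the default Step of 1, since
// the third element is the undef sentinel.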
5646 
5647 /// Return true if every element in Mask, beginning
5648 /// from position Pos and ending in Pos+Size, falls within the specified
5649 /// sequence (Low, Low + Step, ..., Low + (Size-1)*Step), or is undef or zero.
5650 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5651                                              unsigned Size, int Low,
5652                                              int Step = 1) {
5653   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5654     if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5655       return false;
5656   return true;
5657 }
5658 
5659 /// Return true if every element in Mask, beginning
5660 /// from position Pos and ending in Pos+Size is undef or is zero.
5661 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5662                                  unsigned Size) {
5663   return llvm::all_of(Mask.slice(Pos, Size),
5664                       [](int M) { return isUndefOrZero(M); });
5665 }
5666 
5667 /// Helper function to test whether a shuffle mask could be
5668 /// simplified by widening the elements being shuffled.
5669 ///
5670 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5671 /// leaves it in an unspecified state.
5672 ///
5673 /// NOTE: This must handle normal vector shuffle masks and *target* vector
5674 /// shuffle masks. The latter have the special property of a '-2' representing
5675 /// a zeroed lane of a vector.
5676 static bool canWidenShuffleElements(ArrayRef<int> Mask,
5677                                     SmallVectorImpl<int> &WidenedMask) {
5678   WidenedMask.assign(Mask.size() / 2, 0);
5679   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5680     int M0 = Mask[i];
5681     int M1 = Mask[i + 1];
5682 
5683     // If both elements are undef, it's trivial.
5684     if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5685       WidenedMask[i / 2] = SM_SentinelUndef;
5686       continue;
5687     }
5688 
5689     // Check for an undef mask and a mask value properly aligned to fit with
5690     // a pair of values. If we find such a case, use the non-undef mask's value.
5691     if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
5692       WidenedMask[i / 2] = M1 / 2;
5693       continue;
5694     }
5695     if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
5696       WidenedMask[i / 2] = M0 / 2;
5697       continue;
5698     }
5699 
5700     // When zeroing, we need to spread the zeroing across both lanes to widen.
5701     if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
5702       if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
5703           (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
5704         WidenedMask[i / 2] = SM_SentinelZero;
5705         continue;
5706       }
5707       return false;
5708     }
5709 
5710     // Finally check if the two mask values are adjacent and aligned with
5711     // a pair.
5712     if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
5713       WidenedMask[i / 2] = M0 / 2;
5714       continue;
5715     }
5716 
5717     // Otherwise we can't safely widen the elements used in this shuffle.
5718     return false;
5719   }
5720   assert(WidenedMask.size() == Mask.size() / 2 &&
5721          "Incorrect size of mask after widening the elements!");
5722 
5723   return true;
5724 }
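
// For illustration: interpreting the mask as v4i32 -> v2i64, <2, 3, 0, 1>
// widens to <1, 0>, while <1, 2, 3, 0> cannot be widened because neither pair
// of adjacent elements maps to a whole wide element.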
5725 
5726 static bool canWidenShuffleElements(ArrayRef<int> Mask,
5727                                     const APInt &Zeroable,
5728                                     bool V2IsZero,
5729                                     SmallVectorImpl<int> &WidenedMask) {
5730   // Create an alternative mask with info about zeroable elements.
5731   // Here we do not set undef elements as zeroable.
5732   SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
5733   if (V2IsZero) {
5734     assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
5735     for (int i = 0, Size = Mask.size(); i != Size; ++i)
5736       if (Mask[i] != SM_SentinelUndef && Zeroable[i])
5737         ZeroableMask[i] = SM_SentinelZero;
5738   }
5739   return canWidenShuffleElements(ZeroableMask, WidenedMask);
5740 }
5741 
5742 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
5743   SmallVector<int, 32> WidenedMask;
5744   return canWidenShuffleElements(Mask, WidenedMask);
5745 }
5746 
5747 // Attempt to narrow/widen shuffle mask until it matches the target number of
5748 // elements.
5749 static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
5750                                  SmallVectorImpl<int> &ScaledMask) {
5751   unsigned NumSrcElts = Mask.size();
5752   assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
5753          "Illegal shuffle scale factor");
5754 
5755   // Narrowing is guaranteed to work.
5756   if (NumDstElts >= NumSrcElts) {
5757     int Scale = NumDstElts / NumSrcElts;
5758     llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
5759     return true;
5760   }
5761 
5762   // We have to repeat the widening until we reach the target size, but we can
5763   // split out the first widening as it sets up ScaledMask for us.
5764   if (canWidenShuffleElements(Mask, ScaledMask)) {
5765     while (ScaledMask.size() > NumDstElts) {
5766       SmallVector<int, 16> WidenedMask;
5767       if (!canWidenShuffleElements(ScaledMask, WidenedMask))
5768         return false;
5769       ScaledMask = std::move(WidenedMask);
5770     }
5771     return true;
5772   }
5773 
5774   return false;
5775 }
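
// For illustration: scaling the 2-element mask <1, 0> up to 4 elements yields
// <2, 3, 0, 1>, while scaling <0, 2, 1, 3> down to 2 elements fails because
// the element pairs do not form whole widened elements.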
5776 
5777 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
5778 bool X86::isZeroNode(SDValue Elt) {
5779   return isNullConstant(Elt) || isNullFPConstant(Elt);
5780 }
5781 
5782 // Build a vector of constants.
5783 // Use an UNDEF node if MaskElt == -1.
5784 // Split 64-bit constants in 32-bit mode.
5785 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
5786                               const SDLoc &dl, bool IsMask = false) {
5787 
5788   SmallVector<SDValue, 32>  Ops;
5789   bool Split = false;
5790 
5791   MVT ConstVecVT = VT;
5792   unsigned NumElts = VT.getVectorNumElements();
5793   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5794   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5795     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5796     Split = true;
5797   }
5798 
5799   MVT EltVT = ConstVecVT.getVectorElementType();
5800   for (unsigned i = 0; i < NumElts; ++i) {
5801     bool IsUndef = Values[i] < 0 && IsMask;
5802     SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
5803       DAG.getConstant(Values[i], dl, EltVT);
5804     Ops.push_back(OpNode);
5805     if (Split)
5806       Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
5807                     DAG.getConstant(0, dl, EltVT));
5808   }
5809   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5810   if (Split)
5811     ConstsNode = DAG.getBitcast(VT, ConstsNode);
5812   return ConstsNode;
5813 }
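
// For illustration: on a 32-bit target a v2i64 request is built as a v4i32
// build_vector in which each 64-bit element becomes a low/high i32 pair (the
// high half zero, or undef for undef mask elements) and is then bitcast back
// to v2i64.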
5814 
5815 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
5816                               MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5817   assert(Bits.size() == Undefs.getBitWidth() &&
5818          "Unequal constant and undef arrays");
5819   SmallVector<SDValue, 32> Ops;
5820   bool Split = false;
5821 
5822   MVT ConstVecVT = VT;
5823   unsigned NumElts = VT.getVectorNumElements();
5824   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5825   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5826     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5827     Split = true;
5828   }
5829 
5830   MVT EltVT = ConstVecVT.getVectorElementType();
5831   for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
5832     if (Undefs[i]) {
5833       Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
5834       continue;
5835     }
5836     const APInt &V = Bits[i];
5837     assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
5838     if (Split) {
5839       Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5840       Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5841     } else if (EltVT == MVT::f32) {
5842       APFloat FV(APFloat::IEEEsingle(), V);
5843       Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5844     } else if (EltVT == MVT::f64) {
5845       APFloat FV(APFloat::IEEEdouble(), V);
5846       Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5847     } else {
5848       Ops.push_back(DAG.getConstant(V, dl, EltVT));
5849     }
5850   }
5851 
5852   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5853   return DAG.getBitcast(VT, ConstsNode);
5854 }
5855 
5856 /// Returns a vector of specified type with all zero elements.
5857 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5858                              SelectionDAG &DAG, const SDLoc &dl) {
5859   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
5860           VT.getVectorElementType() == MVT::i1) &&
5861          "Unexpected vector type");
5862 
5863   // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5864   // type. This ensures they get CSE'd. But if the integer type is not
5865   // available, use a floating-point +0.0 instead.
5866   SDValue Vec;
5867   if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5868     Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5869   } else if (VT.isFloatingPoint()) {
5870     Vec = DAG.getConstantFP(+0.0, dl, VT);
5871   } else if (VT.getVectorElementType() == MVT::i1) {
5872     assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
5873            "Unexpected vector type");
5874     Vec = DAG.getConstant(0, dl, VT);
5875   } else {
5876     unsigned Num32BitElts = VT.getSizeInBits() / 32;
5877     Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5878   }
5879   return DAG.getBitcast(VT, Vec);
5880 }
5881 
5882 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5883                                 const SDLoc &dl, unsigned vectorWidth) {
5884   EVT VT = Vec.getValueType();
5885   EVT ElVT = VT.getVectorElementType();
5886   unsigned Factor = VT.getSizeInBits() / vectorWidth;
5887   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5888                                   VT.getVectorNumElements() / Factor);
5889 
5890   // Extract the relevant vectorWidth bits by generating an EXTRACT_SUBVECTOR.
5891   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5892   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5893 
5894   // This is the index of the first element of the vectorWidth-bit chunk
5895   // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
5896   IdxVal &= ~(ElemsPerChunk - 1);
5897 
5898   // If the input is a buildvector just emit a smaller one.
5899   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5900     return DAG.getBuildVector(ResultVT, dl,
5901                               Vec->ops().slice(IdxVal, ElemsPerChunk));
5902 
5903   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5904   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5905 }
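
// For illustration: extracting 128 bits from a v8f32 with IdxVal = 5 rounds
// the index down to 4 (ElemsPerChunk is 4) and returns the upper v4f32 half.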
5906 
5907 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
5908 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5909 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5910 /// instructions or a simple subregister reference. Idx is an index in the
5911 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
5912 /// lowering EXTRACT_VECTOR_ELT operations easier.
5913 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5914                                    SelectionDAG &DAG, const SDLoc &dl) {
5915   assert((Vec.getValueType().is256BitVector() ||
5916           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5917   return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5918 }
5919 
5920 /// Generate a DAG to grab 256-bits from a 512-bit vector.
5921 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5922                                    SelectionDAG &DAG, const SDLoc &dl) {
5923   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5924   return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5925 }
5926 
5927 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5928                                SelectionDAG &DAG, const SDLoc &dl,
5929                                unsigned vectorWidth) {
5930   assert((vectorWidth == 128 || vectorWidth == 256) &&
5931          "Unsupported vector width");
5932   // Inserting an UNDEF subvector leaves Result unchanged.
5933   if (Vec.isUndef())
5934     return Result;
5935   EVT VT = Vec.getValueType();
5936   EVT ElVT = VT.getVectorElementType();
5937   EVT ResultVT = Result.getValueType();
5938 
5939   // Insert the relevant vectorWidth bits.
5940   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5941   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5942 
5943   // This is the index of the first element of the vectorWidth-bit chunk
5944   // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
5945   IdxVal &= ~(ElemsPerChunk - 1);
5946 
5947   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5948   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5949 }
5950 
5951 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
5952 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5953 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5954 /// simple superregister reference.  Idx is an index in the 128 bits
5955 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
5956 /// lowering INSERT_VECTOR_ELT operations easier.
5957 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5958                                   SelectionDAG &DAG, const SDLoc &dl) {
5959   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5960   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5961 }
5962 
5963 /// Widen a vector to a larger size with the same scalar type, with the new
5964 /// elements either zero or undef.
5965 static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5966                               const X86Subtarget &Subtarget, SelectionDAG &DAG,
5967                               const SDLoc &dl) {
5968   assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&
5969          Vec.getValueType().getScalarType() == VT.getScalarType() &&
5970          "Unsupported vector widening type");
5971   SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5972                                 : DAG.getUNDEF(VT);
5973   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5974                      DAG.getIntPtrConstant(0, dl));
5975 }
5976 
5977 /// Widen a vector to a larger size with the same scalar type, with the new
5978 /// elements either zero or undef.
5979 static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
5980                               const X86Subtarget &Subtarget, SelectionDAG &DAG,
5981                               const SDLoc &dl, unsigned WideSizeInBits) {
5982   assert(Vec.getValueSizeInBits() < WideSizeInBits &&
5983          (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
5984          "Unsupported vector widening type");
5985   unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
5986   MVT SVT = Vec.getSimpleValueType().getScalarType();
5987   MVT VT = MVT::getVectorVT(SVT, WideNumElts);
5988   return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
5989 }
5990 
5991 // Helper function to collect subvector ops that are concatenated together,
5992 // either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
5993 // The subvectors in Ops are guaranteed to be the same type.
5994 static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
5995   assert(Ops.empty() && "Expected an empty ops vector");
5996 
5997   if (N->getOpcode() == ISD::CONCAT_VECTORS) {
5998     Ops.append(N->op_begin(), N->op_end());
5999     return true;
6000   }
6001 
6002   if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6003     SDValue Src = N->getOperand(0);
6004     SDValue Sub = N->getOperand(1);
6005     const APInt &Idx = N->getConstantOperandAPInt(2);
6006     EVT VT = Src.getValueType();
6007     EVT SubVT = Sub.getValueType();
6008 
6009     // TODO - Handle more general insert_subvector chains.
6010     if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
6011         Idx == (VT.getVectorNumElements() / 2)) {
6012       // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6013       if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6014           Src.getOperand(1).getValueType() == SubVT &&
6015           isNullConstant(Src.getOperand(2))) {
6016         Ops.push_back(Src.getOperand(1));
6017         Ops.push_back(Sub);
6018         return true;
6019       }
6020       // insert_subvector(x, extract_subvector(x, lo), hi)
6021       if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6022           Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6023         Ops.append(2, Sub);
6024         return true;
6025       }
6026     }
6027   }
6028 
6029   return false;
6030 }
6031 
6032 static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6033                                                const SDLoc &dl) {
6034   EVT VT = Op.getValueType();
6035   unsigned NumElems = VT.getVectorNumElements();
6036   unsigned SizeInBits = VT.getSizeInBits();
6037   assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6038          "Can't split odd sized vector");
6039 
6040   SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6041   SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6042   return std::make_pair(Lo, Hi);
6043 }
6044 
6045 // Split a unary integer op into 2 half-sized ops.
6046 static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6047   EVT VT = Op.getValueType();
6048 
6049   // Make sure we only try to split 256/512-bit types to avoid creating
6050   // narrow vectors.
6051   assert((Op.getOperand(0).getValueType().is256BitVector() ||
6052           Op.getOperand(0).getValueType().is512BitVector()) &&
6053          (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6054   assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6055              VT.getVectorNumElements() &&
6056          "Unexpected VTs!");
6057 
6058   SDLoc dl(Op);
6059 
6060   // Extract the Lo/Hi vectors
6061   SDValue Lo, Hi;
6062   std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);
6063 
6064   EVT LoVT, HiVT;
6065   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6066   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6067                      DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
6068                      DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
6069 }
6070 
6071 /// Break a binary integer operation into 2 half-sized ops and then
6072 /// concatenate the result back.
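/// For example (a sketch, not tied to a specific caller): a v32i16 ISD::SMAX
/// would become
///   concat_vectors(smax(lo(A), lo(B)), smax(hi(A), hi(B)))
/// where lo/hi are the v16i16 halves returned by splitVector.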
6073 static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6074   EVT VT = Op.getValueType();
6075 
6076   // Sanity check that all the types match.
6077   assert(Op.getOperand(0).getValueType() == VT &&
6078          Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6079   assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6080 
6081   SDLoc dl(Op);
6082 
6083   // Extract the LHS Lo/Hi vectors
6084   SDValue LHS1, LHS2;
6085   std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
6086 
6087   // Extract the RHS Lo/Hi vectors
6088   SDValue RHS1, RHS2;
6089   std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
6090 
6091   EVT LoVT, HiVT;
6092   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6093   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6094                      DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
6095                      DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
6096 }
6097 
6098 // Helper for splitting operands of an operation to legal target size and
6099 // apply a function on each part.
6100 // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6101 // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6102 // deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6103 // The argument Builder is a function that will be applied on each split part:
6104 // SDValue Builder(SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops)
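//
// Illustrative (hypothetical) call site, assuming SDValues A and B of type
// MVT::v64i8 are in scope:
//   auto ADDSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                           ArrayRef<SDValue> Ops) {
//     return DAG.getNode(ISD::UADDSAT, DL, Ops[0].getValueType(), Ops);
//   };
//   SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, MVT::v64i8, {A, B},
//                                ADDSATBuilder);
// This applies the builder to four v16i8 pairs on SSE2, two v32i8 pairs on
// AVX2, or the whole v64i8 pair when 512-bit BWI registers are usable.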
6105 template <typename F>
6106 SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6107                          const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6108                          F Builder, bool CheckBWI = true) {
6109   assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6110   unsigned NumSubs = 1;
6111   if ((CheckBWI && Subtarget.useBWIRegs()) ||
6112       (!CheckBWI && Subtarget.useAVX512Regs())) {
6113     if (VT.getSizeInBits() > 512) {
6114       NumSubs = VT.getSizeInBits() / 512;
6115       assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6116     }
6117   } else if (Subtarget.hasAVX2()) {
6118     if (VT.getSizeInBits() > 256) {
6119       NumSubs = VT.getSizeInBits() / 256;
6120       assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6121     }
6122   } else {
6123     if (VT.getSizeInBits() > 128) {
6124       NumSubs = VT.getSizeInBits() / 128;
6125       assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6126     }
6127   }
6128 
6129   if (NumSubs == 1)
6130     return Builder(DAG, DL, Ops);
6131 
6132   SmallVector<SDValue, 4> Subs;
6133   for (unsigned i = 0; i != NumSubs; ++i) {
6134     SmallVector<SDValue, 2> SubOps;
6135     for (SDValue Op : Ops) {
6136       EVT OpVT = Op.getValueType();
6137       unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6138       unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6139       SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6140     }
6141     Subs.push_back(Builder(DAG, DL, SubOps));
6142   }
6143   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6144 }
6145 
6146 /// Insert i1-subvector to i1-vector.
6147 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6148                                 const X86Subtarget &Subtarget) {
6149 
6150   SDLoc dl(Op);
6151   SDValue Vec = Op.getOperand(0);
6152   SDValue SubVec = Op.getOperand(1);
6153   SDValue Idx = Op.getOperand(2);
6154   unsigned IdxVal = Op.getConstantOperandVal(2);
6155 
6156   // Inserting undef is a nop. We can just return the original vector.
6157   if (SubVec.isUndef())
6158     return Vec;
6159 
6160   if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6161     return Op;
6162 
6163   MVT OpVT = Op.getSimpleValueType();
6164   unsigned NumElems = OpVT.getVectorNumElements();
6165   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6166 
6167   // Extend to natively supported kshift.
6168   MVT WideOpVT = OpVT;
6169   if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6170     WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6171 
6172   // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6173   // if necessary.
6174   if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6175     // May need to promote to a legal type.
6176     Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6177                      DAG.getConstant(0, dl, WideOpVT),
6178                      SubVec, Idx);
6179     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6180   }
6181 
6182   MVT SubVecVT = SubVec.getSimpleValueType();
6183   unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6184   assert(IdxVal + SubVecNumElems <= NumElems &&
6185          IdxVal % SubVecNumElems == 0 &&
6186          "Unexpected index value in INSERT_SUBVECTOR");
6187 
6188   SDValue Undef = DAG.getUNDEF(WideOpVT);
6189 
6190   if (IdxVal == 0) {
6191     // Zero lower bits of the Vec
6192     SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6193     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6194                       ZeroIdx);
6195     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6196     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6197     // Merge them together, SubVec should be zero extended.
6198     SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6199                          DAG.getConstant(0, dl, WideOpVT),
6200                          SubVec, ZeroIdx);
6201     Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6202     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6203   }
6204 
6205   SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6206                        Undef, SubVec, ZeroIdx);
6207 
6208   if (Vec.isUndef()) {
6209     assert(IdxVal != 0 && "Unexpected index");
6210     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6211                          DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6212     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6213   }
6214 
6215   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6216     assert(IdxVal != 0 && "Unexpected index");
6217     NumElems = WideOpVT.getVectorNumElements();
6218     unsigned ShiftLeft = NumElems - SubVecNumElems;
6219     unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6220     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6221                          DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6222     if (ShiftRight != 0)
6223       SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6224                            DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6225     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6226   }
6227 
6228   // Simple case when we put the subvector in the upper part.
6229   if (IdxVal + SubVecNumElems == NumElems) {
6230     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6231                          DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6232     if (SubVecNumElems * 2 == NumElems) {
6233       // Special case, use legal zero extending insert_subvector. This allows
6234       // isel to optimize when bits are known zero.
6235       Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6236       Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6237                         DAG.getConstant(0, dl, WideOpVT),
6238                         Vec, ZeroIdx);
6239     } else {
6240       // Otherwise use explicit shifts to zero the bits.
6241       Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6242                         Undef, Vec, ZeroIdx);
6243       NumElems = WideOpVT.getVectorNumElements();
6244       SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
6245       Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6246       Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6247     }
6248     Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6249     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6250   }
6251 
6252   // Inserting into the middle is more complicated.
6253 
6254   NumElems = WideOpVT.getVectorNumElements();
6255 
6256   // Widen the vector if needed.
6257   Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
6258 
6259   unsigned ShiftLeft = NumElems - SubVecNumElems;
6260   unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
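  // For example (illustrative numbers only): inserting a v4i1 subvector into
  // a v16i1 vector at IdxVal == 4 gives ShiftLeft == 12 and ShiftRight == 8,
  // so the KSHIFTL places the four subvector bits at positions [12,16) and
  // the KSHIFTR then moves them down into the insertion window [4,8).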
6261 
6262   // Do an optimization for the most frequently used types.
6263   if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
6264     APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
6265     Mask0.flipAllBits();
6266     SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
6267     SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
6268     Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
6269     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6270                          DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6271     SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6272                          DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6273     Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6274 
6275     // Reduce to original width if needed.
6276     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6277   }
6278 
6279   // Clear the upper bits of the subvector and move it to its insert position.
6280   SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6281                        DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6282   SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6283                        DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6284 
6285   // Isolate the bits below the insertion point.
6286   unsigned LowShift = NumElems - IdxVal;
6287   SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
6288                             DAG.getTargetConstant(LowShift, dl, MVT::i8));
6289   Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
6290                     DAG.getTargetConstant(LowShift, dl, MVT::i8));
6291 
6292   // Isolate the bits after the last inserted bit.
6293   unsigned HighShift = IdxVal + SubVecNumElems;
6294   SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
6295                              DAG.getTargetConstant(HighShift, dl, MVT::i8));
6296   High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
6297                      DAG.getTargetConstant(HighShift, dl, MVT::i8));
6298 
6299   // Now OR all 3 pieces together.
6300   Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
6301   SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
6302 
6303   // Reduce to original width if needed.
6304   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6305 }
6306 
6307 static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
6308                                 const SDLoc &dl) {
6309   assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
6310   EVT SubVT = V1.getValueType();
6311   EVT SubSVT = SubVT.getScalarType();
6312   unsigned SubNumElts = SubVT.getVectorNumElements();
6313   unsigned SubVectorWidth = SubVT.getSizeInBits();
6314   EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
6315   SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
6316   return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
6317 }
6318 
6319 /// Returns a vector of specified type with all bits set.
6320 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
6321 /// Then bitcast to their original type, ensuring they get CSE'd.
6322 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6323   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6324          "Expected a 128/256/512-bit vector type");
6325 
6326   APInt Ones = APInt::getAllOnesValue(32);
6327   unsigned NumElts = VT.getSizeInBits() / 32;
6328   SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
6329   return DAG.getBitcast(VT, Vec);
6330 }
6331 
6332 // Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
6333 static unsigned getOpcode_EXTEND(unsigned Opcode) {
6334   switch (Opcode) {
6335   case ISD::ANY_EXTEND:
6336   case ISD::ANY_EXTEND_VECTOR_INREG:
6337     return ISD::ANY_EXTEND;
6338   case ISD::ZERO_EXTEND:
6339   case ISD::ZERO_EXTEND_VECTOR_INREG:
6340     return ISD::ZERO_EXTEND;
6341   case ISD::SIGN_EXTEND:
6342   case ISD::SIGN_EXTEND_VECTOR_INREG:
6343     return ISD::SIGN_EXTEND;
6344   }
6345   llvm_unreachable("Unknown opcode");
6346 }
6347 
6348 // Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
6349 static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
6350   switch (Opcode) {
6351   case ISD::ANY_EXTEND:
6352   case ISD::ANY_EXTEND_VECTOR_INREG:
6353     return ISD::ANY_EXTEND_VECTOR_INREG;
6354   case ISD::ZERO_EXTEND:
6355   case ISD::ZERO_EXTEND_VECTOR_INREG:
6356     return ISD::ZERO_EXTEND_VECTOR_INREG;
6357   case ISD::SIGN_EXTEND:
6358   case ISD::SIGN_EXTEND_VECTOR_INREG:
6359     return ISD::SIGN_EXTEND_VECTOR_INREG;
6360   }
6361   llvm_unreachable("Unknown opcode");
6362 }
6363 
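// Illustrative example (not tied to a specific caller): calling
//   getEXTEND_VECTOR_INREG(ISD::SIGN_EXTEND, DL, MVT::v32i16, In, DAG)
// with In : v64i8 first extracts the low v32i8 subvector and then emits a
// regular v32i16 SIGN_EXTEND, since the element counts now match.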
6364 static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
6365                                       SDValue In, SelectionDAG &DAG) {
6366   EVT InVT = In.getValueType();
6367   assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
6368   assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
6369           ISD::ZERO_EXTEND == Opcode) &&
6370          "Unknown extension opcode");
6371 
6372   // For 256-bit vectors, we only need the lower (128-bit) input half.
6373   // For 512-bit vectors, we only need the lower input half or quarter.
6374   if (InVT.getSizeInBits() > 128) {
6375     assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
6376            "Expected VTs to be the same size!");
6377     unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
6378     In = extractSubVector(In, 0, DAG, DL,
6379                           std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
6380     InVT = In.getValueType();
6381   }
6382 
6383   if (VT.getVectorNumElements() != InVT.getVectorNumElements())
6384     Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
6385 
6386   return DAG.getNode(Opcode, DL, VT, In);
6387 }
6388 
6389 // Match (xor X, -1) -> X.
6390 // Match extract_subvector(xor X, -1) -> extract_subvector(X).
6391 // Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
6392 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
6393   V = peekThroughBitcasts(V);
6394   if (V.getOpcode() == ISD::XOR &&
6395       ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
6396     return V.getOperand(0);
6397   if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6398       (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
6399     if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
6400       Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
6401       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
6402                          Not, V.getOperand(1));
6403     }
6404   }
6405   SmallVector<SDValue, 2> CatOps;
6406   if (collectConcatOps(V.getNode(), CatOps)) {
6407     for (SDValue &CatOp : CatOps) {
6408       SDValue NotCat = IsNOT(CatOp, DAG);
6409       if (!NotCat) return SDValue();
6410       CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
6411     }
6412     return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
6413   }
6414   return SDValue();
6415 }
6416 
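/// Example masks produced by createUnpackShuffleMask (illustrative values for
/// MVT::v8i16):
///   Lo, binary: <0, 8, 1, 9, 2, 10, 3, 11>   (matches unpcklwd)
///   Hi, binary: <4, 12, 5, 13, 6, 14, 7, 15> (matches unpckhwd)
///   Lo, unary:  <0, 0, 1, 1, 2, 2, 3, 3>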
6417 void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
6418                                    bool Lo, bool Unary) {
6419   assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
6420          "Illegal vector type to unpack");
6421   assert(Mask.empty() && "Expected an empty shuffle mask vector");
6422   int NumElts = VT.getVectorNumElements();
6423   int NumEltsInLane = 128 / VT.getScalarSizeInBits();
6424   for (int i = 0; i < NumElts; ++i) {
6425     unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
6426     int Pos = (i % NumEltsInLane) / 2 + LaneStart;
6427     Pos += (Unary ? 0 : NumElts * (i % 2));
6428     Pos += (Lo ? 0 : NumEltsInLane / 2);
6429     Mask.push_back(Pos);
6430   }
6431 }
6432 
6433 /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
6434 /// imposed by AVX and specific to the unary pattern. Example:
6435 /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
6436 /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
6437 void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6438                                    bool Lo) {
6439   assert(Mask.empty() && "Expected an empty shuffle mask vector");
6440   int NumElts = VT.getVectorNumElements();
6441   for (int i = 0; i < NumElts; ++i) {
6442     int Pos = i / 2;
6443     Pos += (Lo ? 0 : NumElts / 2);
6444     Mask.push_back(Pos);
6445   }
6446 }
6447 
6448 /// Returns a vector_shuffle node for an unpackl operation.
6449 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6450                           SDValue V1, SDValue V2) {
6451   SmallVector<int, 8> Mask;
6452   createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
6453   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6454 }
6455 
6456 /// Returns a vector_shuffle node for an unpackh operation.
6457 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6458                           SDValue V1, SDValue V2) {
6459   SmallVector<int, 8> Mask;
6460   createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
6461   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6462 }
6463 
6464 /// Return a vector_shuffle of the specified vector and a zero/undef vector.
6465 /// This produces a shuffle where the low element of V2 is swizzled into the
6466 /// zero/undef vector, landing at element Idx.
6467 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
6468 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
6469                                            bool IsZero,
6470                                            const X86Subtarget &Subtarget,
6471                                            SelectionDAG &DAG) {
6472   MVT VT = V2.getSimpleValueType();
6473   SDValue V1 = IsZero
6474     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
6475   int NumElems = VT.getVectorNumElements();
6476   SmallVector<int, 16> MaskVec(NumElems);
6477   for (int i = 0; i != NumElems; ++i)
6478     // If this is the insertion idx, put the low elt of V2 here.
6479     MaskVec[i] = (i == Idx) ? NumElems : i;
6480   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
6481 }
6482 
6483 static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
6484   if (Ptr.getOpcode() == X86ISD::Wrapper ||
6485       Ptr.getOpcode() == X86ISD::WrapperRIP)
6486     Ptr = Ptr.getOperand(0);
6487 
6488   auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6489   if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
6490     return nullptr;
6491 
6492   return CNode->getConstVal();
6493 }
6494 
6495 static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
6496   if (!Load || !ISD::isNormalLoad(Load))
6497     return nullptr;
6498   return getTargetConstantFromBasePtr(Load->getBasePtr());
6499 }
6500 
6501 static const Constant *getTargetConstantFromNode(SDValue Op) {
6502   Op = peekThroughBitcasts(Op);
6503   return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
6504 }
6505 
6506 const Constant *
6507 X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
6508   assert(LD && "Unexpected null LoadSDNode");
6509   return getTargetConstantFromNode(LD);
6510 }
6511 
6512 // Extract raw constant bits from constant pools and other constant sources.
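// For example (illustrative only): querying a v4i32 build_vector
// <1, 2, undef, 4> with EltSizeInBits == 64 and partial undefs allowed yields
// EltBits == { 0x0000000200000001, 0x0000000400000000 } and an empty
// UndefElts, because the lone undef i32 only covers half of a 64-bit element
// and is therefore treated as zero.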
6513 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
6514                                           APInt &UndefElts,
6515                                           SmallVectorImpl<APInt> &EltBits,
6516                                           bool AllowWholeUndefs = true,
6517                                           bool AllowPartialUndefs = true) {
6518   assert(EltBits.empty() && "Expected an empty EltBits vector");
6519 
6520   Op = peekThroughBitcasts(Op);
6521 
6522   EVT VT = Op.getValueType();
6523   unsigned SizeInBits = VT.getSizeInBits();
6524   assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
6525   unsigned NumElts = SizeInBits / EltSizeInBits;
6526 
6527   // Bitcast a source array of element bits to the target size.
6528   auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
6529     unsigned NumSrcElts = UndefSrcElts.getBitWidth();
6530     unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
6531     assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
6532            "Constant bit sizes don't match");
6533 
6534     // Don't split if we don't allow undef bits.
6535     bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
6536     if (UndefSrcElts.getBoolValue() && !AllowUndefs)
6537       return false;
6538 
6539     // If we're already the right size, don't bother bitcasting.
6540     if (NumSrcElts == NumElts) {
6541       UndefElts = UndefSrcElts;
6542       EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
6543       return true;
6544     }
6545 
6546     // Extract all the undef/constant element data and pack into single bitsets.
6547     APInt UndefBits(SizeInBits, 0);
6548     APInt MaskBits(SizeInBits, 0);
6549 
6550     for (unsigned i = 0; i != NumSrcElts; ++i) {
6551       unsigned BitOffset = i * SrcEltSizeInBits;
6552       if (UndefSrcElts[i])
6553         UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
6554       MaskBits.insertBits(SrcEltBits[i], BitOffset);
6555     }
6556 
6557     // Split the undef/constant single bitset data into the target elements.
6558     UndefElts = APInt(NumElts, 0);
6559     EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
6560 
6561     for (unsigned i = 0; i != NumElts; ++i) {
6562       unsigned BitOffset = i * EltSizeInBits;
6563       APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
6564 
6565       // Only treat an element as UNDEF if all bits are UNDEF.
6566       if (UndefEltBits.isAllOnesValue()) {
6567         if (!AllowWholeUndefs)
6568           return false;
6569         UndefElts.setBit(i);
6570         continue;
6571       }
6572 
6573       // If only some bits are UNDEF then treat them as zero (or bail if not
6574       // supported).
6575       if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
6576         return false;
6577 
6578       EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
6579     }
6580     return true;
6581   };
6582 
6583   // Collect constant bits and insert into mask/undef bit masks.
6584   auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
6585                                 unsigned UndefBitIndex) {
6586     if (!Cst)
6587       return false;
6588     if (isa<UndefValue>(Cst)) {
6589       Undefs.setBit(UndefBitIndex);
6590       return true;
6591     }
6592     if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
6593       Mask = CInt->getValue();
6594       return true;
6595     }
6596     if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
6597       Mask = CFP->getValueAPF().bitcastToAPInt();
6598       return true;
6599     }
6600     return false;
6601   };
6602 
6603   // Handle UNDEFs.
6604   if (Op.isUndef()) {
6605     APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
6606     SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
6607     return CastBitData(UndefSrcElts, SrcEltBits);
6608   }
6609 
6610   // Extract scalar constant bits.
6611   if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
6612     APInt UndefSrcElts = APInt::getNullValue(1);
6613     SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
6614     return CastBitData(UndefSrcElts, SrcEltBits);
6615   }
6616   if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6617     APInt UndefSrcElts = APInt::getNullValue(1);
6618     APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6619     SmallVector<APInt, 64> SrcEltBits(1, RawBits);
6620     return CastBitData(UndefSrcElts, SrcEltBits);
6621   }
6622 
6623   // Extract constant bits from build vector.
6624   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6625     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6626     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6627 
6628     APInt UndefSrcElts(NumSrcElts, 0);
6629     SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6630     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6631       const SDValue &Src = Op.getOperand(i);
6632       if (Src.isUndef()) {
6633         UndefSrcElts.setBit(i);
6634         continue;
6635       }
6636       auto *Cst = cast<ConstantSDNode>(Src);
6637       SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
6638     }
6639     return CastBitData(UndefSrcElts, SrcEltBits);
6640   }
6641   if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
6642     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6643     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6644 
6645     APInt UndefSrcElts(NumSrcElts, 0);
6646     SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6647     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6648       const SDValue &Src = Op.getOperand(i);
6649       if (Src.isUndef()) {
6650         UndefSrcElts.setBit(i);
6651         continue;
6652       }
6653       auto *Cst = cast<ConstantFPSDNode>(Src);
6654       APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6655       SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
6656     }
6657     return CastBitData(UndefSrcElts, SrcEltBits);
6658   }
6659 
6660   // Extract constant bits from constant pool vector.
6661   if (auto *Cst = getTargetConstantFromNode(Op)) {
6662     Type *CstTy = Cst->getType();
6663     unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6664     if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
6665       return false;
6666 
6667     unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
6668     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6669 
6670     APInt UndefSrcElts(NumSrcElts, 0);
6671     SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6672     for (unsigned i = 0; i != NumSrcElts; ++i)
6673       if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
6674                                UndefSrcElts, i))
6675         return false;
6676 
6677     return CastBitData(UndefSrcElts, SrcEltBits);
6678   }
6679 
6680   // Extract constant bits from a broadcasted constant pool scalar.
6681   if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
6682       EltSizeInBits <= VT.getScalarSizeInBits()) {
6683     auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6684     if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
6685       return false;
6686 
6687     SDValue Ptr = MemIntr->getBasePtr();
6688     if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
6689       unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
6690       unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6691 
6692       APInt UndefSrcElts(NumSrcElts, 0);
6693       SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
6694       if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
6695         if (UndefSrcElts[0])
6696           UndefSrcElts.setBits(0, NumSrcElts);
6697         SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
6698         return CastBitData(UndefSrcElts, SrcEltBits);
6699       }
6700     }
6701   }
6702 
6703   // Extract constant bits from a subvector broadcast.
6704   if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
6705     auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6706     SDValue Ptr = MemIntr->getBasePtr();
6707     // The source constant may be larger than the subvector broadcast;
6708     // ensure we extract the correct subvector constants.
6709     if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
6710       Type *CstTy = Cst->getType();
6711       unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6712       unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
6713       if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
6714           (SizeInBits % SubVecSizeInBits) != 0)
6715         return false;
6716       unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
6717       unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
6718       unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
6719       APInt UndefSubElts(NumSubElts, 0);
6720       SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
6721                                         APInt(CstEltSizeInBits, 0));
6722       for (unsigned i = 0; i != NumSubElts; ++i) {
6723         if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
6724                                  UndefSubElts, i))
6725           return false;
6726         for (unsigned j = 1; j != NumSubVecs; ++j)
6727           SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
6728       }
6729       UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
6730                                      UndefSubElts);
6731       return CastBitData(UndefSubElts, SubEltBits);
6732     }
6733   }
6734 
6735   // Extract a rematerialized scalar constant insertion.
6736   if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
6737       Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
6738       isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
6739     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6740     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6741 
6742     APInt UndefSrcElts(NumSrcElts, 0);
6743     SmallVector<APInt, 64> SrcEltBits;
6744     auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
6745     SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
6746     SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
6747     return CastBitData(UndefSrcElts, SrcEltBits);
6748   }
6749 
6750   // Insert constant bits from a base and sub vector sources.
6751   if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
6752     // If we bitcast to larger elements we might lose track of undefs, so to
6753     // be safe don't allow any.
6754     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6755     bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
6756 
6757     APInt UndefSrcElts, UndefSubElts;
6758     SmallVector<APInt, 32> EltSrcBits, EltSubBits;
6759     if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
6760                                       UndefSubElts, EltSubBits,
6761                                       AllowWholeUndefs && AllowUndefs,
6762                                       AllowPartialUndefs && AllowUndefs) &&
6763         getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
6764                                       UndefSrcElts, EltSrcBits,
6765                                       AllowWholeUndefs && AllowUndefs,
6766                                       AllowPartialUndefs && AllowUndefs)) {
6767       unsigned BaseIdx = Op.getConstantOperandVal(2);
6768       UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
6769       for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
6770         EltSrcBits[BaseIdx + i] = EltSubBits[i];
6771       return CastBitData(UndefSrcElts, EltSrcBits);
6772     }
6773   }
6774 
6775   // Extract constant bits from a subvector's source.
6776   if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
6777     // TODO - support extract_subvector through bitcasts.
6778     if (EltSizeInBits != VT.getScalarSizeInBits())
6779       return false;
6780 
6781     if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6782                                       UndefElts, EltBits, AllowWholeUndefs,
6783                                       AllowPartialUndefs)) {
6784       EVT SrcVT = Op.getOperand(0).getValueType();
6785       unsigned NumSrcElts = SrcVT.getVectorNumElements();
6786       unsigned NumSubElts = VT.getVectorNumElements();
6787       unsigned BaseIdx = Op.getConstantOperandVal(1);
6788       UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
6789       if ((BaseIdx + NumSubElts) != NumSrcElts)
6790         EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
6791       if (BaseIdx != 0)
6792         EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
6793       return true;
6794     }
6795   }
6796 
6797   // Extract constant bits from shuffle node sources.
6798   if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
6799     // TODO - support shuffle through bitcasts.
6800     if (EltSizeInBits != VT.getScalarSizeInBits())
6801       return false;
6802 
6803     ArrayRef<int> Mask = SVN->getMask();
6804     if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
6805         llvm::any_of(Mask, [](int M) { return M < 0; }))
6806       return false;
6807 
6808     APInt UndefElts0, UndefElts1;
6809     SmallVector<APInt, 32> EltBits0, EltBits1;
6810     if (isAnyInRange(Mask, 0, NumElts) &&
6811         !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6812                                        UndefElts0, EltBits0, AllowWholeUndefs,
6813                                        AllowPartialUndefs))
6814       return false;
6815     if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
6816         !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
6817                                        UndefElts1, EltBits1, AllowWholeUndefs,
6818                                        AllowPartialUndefs))
6819       return false;
6820 
6821     UndefElts = APInt::getNullValue(NumElts);
6822     for (int i = 0; i != (int)NumElts; ++i) {
6823       int M = Mask[i];
6824       if (M < 0) {
6825         UndefElts.setBit(i);
6826         EltBits.push_back(APInt::getNullValue(EltSizeInBits));
6827       } else if (M < (int)NumElts) {
6828         if (UndefElts0[M])
6829           UndefElts.setBit(i);
6830         EltBits.push_back(EltBits0[M]);
6831       } else {
6832         if (UndefElts1[M - NumElts])
6833           UndefElts.setBit(i);
6834         EltBits.push_back(EltBits1[M - NumElts]);
6835       }
6836     }
6837     return true;
6838   }
6839 
6840   return false;
6841 }
6842 
6843 namespace llvm {
6844 namespace X86 {
6845 bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
6846   APInt UndefElts;
6847   SmallVector<APInt, 16> EltBits;
6848   if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
6849                                     UndefElts, EltBits, true,
6850                                     AllowPartialUndefs)) {
6851     int SplatIndex = -1;
6852     for (int i = 0, e = EltBits.size(); i != e; ++i) {
6853       if (UndefElts[i])
6854         continue;
6855       if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
6856         SplatIndex = -1;
6857         break;
6858       }
6859       SplatIndex = i;
6860     }
6861     if (0 <= SplatIndex) {
6862       SplatVal = EltBits[SplatIndex];
6863       return true;
6864     }
6865   }
6866 
6867   return false;
6868 }
6869 } // namespace X86
6870 } // namespace llvm
6871 
6872 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
6873                                         unsigned MaskEltSizeInBits,
6874                                         SmallVectorImpl<uint64_t> &RawMask,
6875                                         APInt &UndefElts) {
6876   // Extract the raw target constant bits.
6877   SmallVector<APInt, 64> EltBits;
6878   if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
6879                                      EltBits, /* AllowWholeUndefs */ true,
6880                                      /* AllowPartialUndefs */ false))
6881     return false;
6882 
6883   // Insert the extracted elements into the mask.
6884   for (const APInt &Elt : EltBits)
6885     RawMask.push_back(Elt.getZExtValue());
6886 
6887   return true;
6888 }
6889 
6890 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
6891 /// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
6892 /// Note: This ignores saturation, so inputs must be checked first.
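/// For example (illustrative values), the single-stage binary mask for
/// MVT::v16i8 is <0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>,
/// i.e. the low byte of every i16 element of the two packed operands.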
6893 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6894                                   bool Unary, unsigned NumStages = 1) {
6895   assert(Mask.empty() && "Expected an empty shuffle mask vector");
6896   unsigned NumElts = VT.getVectorNumElements();
6897   unsigned NumLanes = VT.getSizeInBits() / 128;
6898   unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
6899   unsigned Offset = Unary ? 0 : NumElts;
6900   unsigned Repetitions = 1u << (NumStages - 1);
6901   unsigned Increment = 1u << NumStages;
6902   assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
6903 
6904   for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
6905     for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
6906       for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
6907         Mask.push_back(Elt + (Lane * NumEltsPerLane));
6908       for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
6909         Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
6910     }
6911   }
6912 }
6913 
6914 // Split the demanded elts of a PACKSS/PACKUS node between its operands.
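// For example (illustrative only): for a v32i8 pack with only result bit 17
// demanded (lane 1, byte 1), this sets bit 9 of DemandedLHS - the second i16
// element in lane 1 of the first operand - and leaves DemandedRHS empty.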
6915 static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
6916                                 APInt &DemandedLHS, APInt &DemandedRHS) {
6917   int NumLanes = VT.getSizeInBits() / 128;
6918   int NumElts = DemandedElts.getBitWidth();
6919   int NumInnerElts = NumElts / 2;
6920   int NumEltsPerLane = NumElts / NumLanes;
6921   int NumInnerEltsPerLane = NumInnerElts / NumLanes;
6922 
6923   DemandedLHS = APInt::getNullValue(NumInnerElts);
6924   DemandedRHS = APInt::getNullValue(NumInnerElts);
6925 
6926   // Map DemandedElts to the packed operands.
6927   for (int Lane = 0; Lane != NumLanes; ++Lane) {
6928     for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
6929       int OuterIdx = (Lane * NumEltsPerLane) + Elt;
6930       int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
6931       if (DemandedElts[OuterIdx])
6932         DemandedLHS.setBit(InnerIdx);
6933       if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
6934         DemandedRHS.setBit(InnerIdx);
6935     }
6936   }
6937 }
6938 
6939 // Split the demanded elts of a HADD/HSUB node between its operands.
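// For example (illustrative only): for a v8i32 HADD, demanding result element
// 5 (lane 1, element 1) marks LHS elements 6 and 7 as demanded, while
// demanding result element 6 marks RHS elements 4 and 5 instead.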
6940 static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
6941                                  APInt &DemandedLHS, APInt &DemandedRHS) {
6942   int NumLanes = VT.getSizeInBits() / 128;
6943   int NumElts = DemandedElts.getBitWidth();
6944   int NumEltsPerLane = NumElts / NumLanes;
6945   int HalfEltsPerLane = NumEltsPerLane / 2;
6946 
6947   DemandedLHS = APInt::getNullValue(NumElts);
6948   DemandedRHS = APInt::getNullValue(NumElts);
6949 
6950   // Map DemandedElts to the horizontal operands.
6951   for (int Idx = 0; Idx != NumElts; ++Idx) {
6952     if (!DemandedElts[Idx])
6953       continue;
6954     int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
6955     int LocalIdx = Idx % NumEltsPerLane;
6956     if (LocalIdx < HalfEltsPerLane) {
6957       DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6958       DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6959     } else {
6960       LocalIdx -= HalfEltsPerLane;
6961       DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6962       DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6963     }
6964   }
6965 }
6966 
6967 /// Calculates the shuffle mask corresponding to the target-specific opcode.
6968 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
6969 /// operands in \p Ops, and returns true.
6970 /// Sets \p IsUnary to true if only one source is used. Note that this will set
6971 /// IsUnary for shuffles which use a single input multiple times, and in those
6972 /// cases it will adjust the mask to only have indices within that single input.
6973 /// It is an error to call this with non-empty Mask/Ops vectors.
6974 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
6975                                  SmallVectorImpl<SDValue> &Ops,
6976                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
6977   unsigned NumElems = VT.getVectorNumElements();
6978   unsigned MaskEltSize = VT.getScalarSizeInBits();
6979   SmallVector<uint64_t, 32> RawMask;
6980   APInt RawUndefs;
6981   uint64_t ImmN;
6982 
6983   assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
6984   assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
6985 
6986   IsUnary = false;
6987   bool IsFakeUnary = false;
6988   switch (N->getOpcode()) {
6989   case X86ISD::BLENDI:
6990     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6991     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6992     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6993     DecodeBLENDMask(NumElems, ImmN, Mask);
6994     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6995     break;
6996   case X86ISD::SHUFP:
6997     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6998     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6999     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7000     DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7001     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7002     break;
7003   case X86ISD::INSERTPS:
7004     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7005     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7006     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7007     DecodeINSERTPSMask(ImmN, Mask);
7008     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7009     break;
7010   case X86ISD::EXTRQI:
7011     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7012     if (isa<ConstantSDNode>(N->getOperand(1)) &&
7013         isa<ConstantSDNode>(N->getOperand(2))) {
7014       int BitLen = N->getConstantOperandVal(1);
7015       int BitIdx = N->getConstantOperandVal(2);
7016       DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7017       IsUnary = true;
7018     }
7019     break;
7020   case X86ISD::INSERTQI:
7021     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7022     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7023     if (isa<ConstantSDNode>(N->getOperand(2)) &&
7024         isa<ConstantSDNode>(N->getOperand(3))) {
7025       int BitLen = N->getConstantOperandVal(2);
7026       int BitIdx = N->getConstantOperandVal(3);
7027       DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7028       IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7029     }
7030     break;
7031   case X86ISD::UNPCKH:
7032     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7033     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7034     DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7035     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7036     break;
7037   case X86ISD::UNPCKL:
7038     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7039     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7040     DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7041     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7042     break;
7043   case X86ISD::MOVHLPS:
7044     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7045     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7046     DecodeMOVHLPSMask(NumElems, Mask);
7047     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7048     break;
7049   case X86ISD::MOVLHPS:
7050     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7051     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7052     DecodeMOVLHPSMask(NumElems, Mask);
7053     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7054     break;
7055   case X86ISD::VALIGN:
7056     assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7057            "Only 32-bit and 64-bit elements are supported!");
7058     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7059     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7060     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7061     DecodeVALIGNMask(NumElems, ImmN, Mask);
7062     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7063     Ops.push_back(N->getOperand(1));
7064     Ops.push_back(N->getOperand(0));
7065     break;
7066   case X86ISD::PALIGNR:
7067     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7068     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7069     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7070     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7071     DecodePALIGNRMask(NumElems, ImmN, Mask);
7072     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7073     Ops.push_back(N->getOperand(1));
7074     Ops.push_back(N->getOperand(0));
7075     break;
7076   case X86ISD::VSHLDQ:
7077     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7078     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7079     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7080     DecodePSLLDQMask(NumElems, ImmN, Mask);
7081     IsUnary = true;
7082     break;
7083   case X86ISD::VSRLDQ:
7084     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7085     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7086     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7087     DecodePSRLDQMask(NumElems, ImmN, Mask);
7088     IsUnary = true;
7089     break;
7090   case X86ISD::PSHUFD:
7091   case X86ISD::VPERMILPI:
7092     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7093     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7094     DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7095     IsUnary = true;
7096     break;
7097   case X86ISD::PSHUFHW:
7098     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7099     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7100     DecodePSHUFHWMask(NumElems, ImmN, Mask);
7101     IsUnary = true;
7102     break;
7103   case X86ISD::PSHUFLW:
7104     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7105     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7106     DecodePSHUFLWMask(NumElems, ImmN, Mask);
7107     IsUnary = true;
7108     break;
7109   case X86ISD::VZEXT_MOVL:
7110     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7111     DecodeZeroMoveLowMask(NumElems, Mask);
7112     IsUnary = true;
7113     break;
7114   case X86ISD::VBROADCAST:
7115     // We only decode broadcasts of same-sized vectors; peeking through to
7116     // extracted subvectors is likely to cause hasOneUse issues with
7117     // SimplifyDemandedBits etc.
7118     if (N->getOperand(0).getValueType() == VT) {
7119       DecodeVectorBroadcast(NumElems, Mask);
7120       IsUnary = true;
7121       break;
7122     }
7123     return false;
7124   case X86ISD::VPERMILPV: {
7125     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7126     IsUnary = true;
7127     SDValue MaskNode = N->getOperand(1);
7128     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7129                                     RawUndefs)) {
7130       DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7131       break;
7132     }
7133     return false;
7134   }
7135   case X86ISD::PSHUFB: {
7136     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7137     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7138     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7139     IsUnary = true;
7140     SDValue MaskNode = N->getOperand(1);
7141     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7142       DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7143       break;
7144     }
7145     return false;
7146   }
7147   case X86ISD::VPERMI:
7148     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7149     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7150     DecodeVPERMMask(NumElems, ImmN, Mask);
7151     IsUnary = true;
7152     break;
7153   case X86ISD::MOVSS:
7154   case X86ISD::MOVSD:
7155     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7156     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7157     DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7158     break;
7159   case X86ISD::VPERM2X128:
7160     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7161     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7162     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7163     DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7164     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7165     break;
7166   case X86ISD::SHUF128:
7167     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7168     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7169     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7170     decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7171     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7172     break;
7173   case X86ISD::MOVSLDUP:
7174     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7175     DecodeMOVSLDUPMask(NumElems, Mask);
7176     IsUnary = true;
7177     break;
7178   case X86ISD::MOVSHDUP:
7179     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7180     DecodeMOVSHDUPMask(NumElems, Mask);
7181     IsUnary = true;
7182     break;
7183   case X86ISD::MOVDDUP:
7184     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7185     DecodeMOVDDUPMask(NumElems, Mask);
7186     IsUnary = true;
7187     break;
7188   case X86ISD::VPERMIL2: {
7189     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7190     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7191     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7192     SDValue MaskNode = N->getOperand(2);
7193     SDValue CtrlNode = N->getOperand(3);
7194     if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7195       unsigned CtrlImm = CtrlOp->getZExtValue();
7196       if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7197                                       RawUndefs)) {
7198         DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7199                             Mask);
7200         break;
7201       }
7202     }
7203     return false;
7204   }
7205   case X86ISD::VPPERM: {
7206     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7207     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7208     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7209     SDValue MaskNode = N->getOperand(2);
7210     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7211       DecodeVPPERMMask(RawMask, RawUndefs, Mask);
7212       break;
7213     }
7214     return false;
7215   }
7216   case X86ISD::VPERMV: {
7217     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7218     IsUnary = true;
7219     // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
7220     Ops.push_back(N->getOperand(1));
7221     SDValue MaskNode = N->getOperand(0);
7222     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7223                                     RawUndefs)) {
7224       DecodeVPERMVMask(RawMask, RawUndefs, Mask);
7225       break;
7226     }
7227     return false;
7228   }
7229   case X86ISD::VPERMV3: {
7230     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7231     assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
7232     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
7233     // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
7234     Ops.push_back(N->getOperand(0));
7235     Ops.push_back(N->getOperand(2));
7236     SDValue MaskNode = N->getOperand(1);
7237     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7238                                     RawUndefs)) {
7239       DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
7240       break;
7241     }
7242     return false;
7243   }
7244   default: llvm_unreachable("unknown target shuffle node");
7245   }
7246 
7247   // Empty mask indicates the decode failed.
7248   if (Mask.empty())
7249     return false;
7250 
7251   // Check if we're getting a shuffle mask with zero'd elements.
7252   if (!AllowSentinelZero && isAnyZero(Mask))
7253     return false;
7254 
7255   // If we have a fake unary shuffle, the shuffle mask is spread across two
7256   // inputs that are actually the same node. Re-map the mask to always point
7257   // into the first input.
7258   if (IsFakeUnary)
7259     for (int &M : Mask)
7260       if (M >= (int)Mask.size())
7261         M -= Mask.size();
7262 
7263   // If we didn't already add operands in the opcode-specific code, default to
7264   // adding 1 or 2 operands starting at 0.
7265   if (Ops.empty()) {
7266     Ops.push_back(N->getOperand(0));
7267     if (!IsUnary || IsFakeUnary)
7268       Ops.push_back(N->getOperand(1));
7269   }
7270 
7271   return true;
7272 }
7273 
7274 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
7275 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7276                                  SmallVectorImpl<SDValue> &Ops,
7277                                  SmallVectorImpl<int> &Mask) {
7278   bool IsUnary;
7279   return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
7280 }
7281 
7282 /// Compute whether each element of a shuffle is zeroable.
7283 ///
7284 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7285 /// Either it is an undef element in the shuffle mask, the element of the input
7286 /// referenced is undef, or the element of the input referenced is known to be
7287 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7288 /// as many lanes with this technique as possible to simplify the remaining
7289 /// shuffle.
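/// For example (illustrative only): shuffling a build_vector whose element 1
/// is undef against an all-zeros vector with mask <0, 1, 4, 7> marks element 1
/// as known-undef and elements 2 and 3 as known-zero.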
7290 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
7291                                            SDValue V1, SDValue V2,
7292                                            APInt &KnownUndef, APInt &KnownZero) {
7293   int Size = Mask.size();
7294   KnownUndef = KnownZero = APInt::getNullValue(Size);
7295 
7296   V1 = peekThroughBitcasts(V1);
7297   V2 = peekThroughBitcasts(V2);
7298 
7299   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7300   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7301 
7302   int VectorSizeInBits = V1.getValueSizeInBits();
7303   int ScalarSizeInBits = VectorSizeInBits / Size;
7304   assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7305 
7306   for (int i = 0; i < Size; ++i) {
7307     int M = Mask[i];
7308     // Handle the easy cases.
7309     if (M < 0) {
7310       KnownUndef.setBit(i);
7311       continue;
7312     }
7313     if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7314       KnownZero.setBit(i);
7315       continue;
7316     }
7317 
7318     // Determine shuffle input and normalize the mask.
7319     SDValue V = M < Size ? V1 : V2;
7320     M %= Size;
7321 
7322     // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7323     if (V.getOpcode() != ISD::BUILD_VECTOR)
7324       continue;
7325 
7326     // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
7327     // the (larger) source element must be UNDEF/ZERO.
7328     if ((Size % V.getNumOperands()) == 0) {
7329       int Scale = Size / V->getNumOperands();
7330       SDValue Op = V.getOperand(M / Scale);
7331       if (Op.isUndef())
7332         KnownUndef.setBit(i);
7333       if (X86::isZeroNode(Op))
7334         KnownZero.setBit(i);
7335       else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7336         APInt Val = Cst->getAPIntValue();
7337         Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7338         if (Val == 0)
7339           KnownZero.setBit(i);
7340       } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7341         APInt Val = Cst->getValueAPF().bitcastToAPInt();
7342         Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7343         if (Val == 0)
7344           KnownZero.setBit(i);
7345       }
7346       continue;
7347     }
7348 
7349     // If the BUILD_VECTOR has more elements, then all the (smaller) source
7350     // elements must be UNDEF or ZERO.
7351     if ((V.getNumOperands() % Size) == 0) {
7352       int Scale = V->getNumOperands() / Size;
7353       bool AllUndef = true;
7354       bool AllZero = true;
7355       for (int j = 0; j < Scale; ++j) {
7356         SDValue Op = V.getOperand((M * Scale) + j);
7357         AllUndef &= Op.isUndef();
7358         AllZero &= X86::isZeroNode(Op);
7359       }
7360       if (AllUndef)
7361         KnownUndef.setBit(i);
7362       if (AllZero)
7363         KnownZero.setBit(i);
7364       continue;
7365     }
7366   }
7367 }
7368 
7369 /// Decode a target shuffle mask and inputs and see if any values are
7370 /// known to be undef or zero from their inputs.
7371 /// Returns true if the target shuffle mask was decoded.
7372 /// FIXME: Merge this with computeZeroableShuffleElements?
7373 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
7374                                          SmallVectorImpl<SDValue> &Ops,
7375                                          APInt &KnownUndef, APInt &KnownZero) {
7376   bool IsUnary;
7377   if (!isTargetShuffle(N.getOpcode()))
7378     return false;
7379 
7380   MVT VT = N.getSimpleValueType();
7381   if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
7382     return false;
7383 
7384   int Size = Mask.size();
7385   SDValue V1 = Ops[0];
7386   SDValue V2 = IsUnary ? V1 : Ops[1];
7387   KnownUndef = KnownZero = APInt::getNullValue(Size);
7388 
7389   V1 = peekThroughBitcasts(V1);
7390   V2 = peekThroughBitcasts(V2);
7391 
7392   assert((VT.getSizeInBits() % Size) == 0 &&
7393          "Illegal split of shuffle value type");
7394   unsigned EltSizeInBits = VT.getSizeInBits() / Size;
7395 
7396   // Extract known constant input data.
7397   APInt UndefSrcElts[2];
7398   SmallVector<APInt, 32> SrcEltBits[2];
7399   bool IsSrcConstant[2] = {
7400       getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
7401                                     SrcEltBits[0], true, false),
7402       getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
7403                                     SrcEltBits[1], true, false)};
7404 
7405   for (int i = 0; i < Size; ++i) {
7406     int M = Mask[i];
7407 
7408     // Already decoded as SM_SentinelZero / SM_SentinelUndef.
7409     if (M < 0) {
7410       assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
7411       if (SM_SentinelUndef == M)
7412         KnownUndef.setBit(i);
7413       if (SM_SentinelZero == M)
7414         KnownZero.setBit(i);
7415       continue;
7416     }
7417 
7418     // Determine shuffle input and normalize the mask.
7419     unsigned SrcIdx = M / Size;
7420     SDValue V = M < Size ? V1 : V2;
7421     M %= Size;
7422 
7423     // We are referencing an UNDEF input.
7424     if (V.isUndef()) {
7425       KnownUndef.setBit(i);
7426       continue;
7427     }
7428 
7429     // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
7430     // TODO: We currently only set UNDEF for integer types - floats use the same
7431     // registers as vectors and many of the scalar folded loads rely on the
7432     // SCALAR_TO_VECTOR pattern.
7433     if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
7434         (Size % V.getValueType().getVectorNumElements()) == 0) {
7435       int Scale = Size / V.getValueType().getVectorNumElements();
7436       int Idx = M / Scale;
7437       if (Idx != 0 && !VT.isFloatingPoint())
7438         KnownUndef.setBit(i);
7439       else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
7440         KnownZero.setBit(i);
7441       continue;
7442     }
7443 
7444     // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
7445     // base vectors.
7446     if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
7447       SDValue Vec = V.getOperand(0);
7448       int NumVecElts = Vec.getValueType().getVectorNumElements();
7449       if (Vec.isUndef() && Size == NumVecElts) {
7450         int Idx = V.getConstantOperandVal(2);
7451         int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
7452         if (M < Idx || (Idx + NumSubElts) <= M)
7453           KnownUndef.setBit(i);
7454       }
7455       continue;
7456     }
7457 
7458     // Attempt to extract from the source's constant bits.
7459     if (IsSrcConstant[SrcIdx]) {
7460       if (UndefSrcElts[SrcIdx][M])
7461         KnownUndef.setBit(i);
7462       else if (SrcEltBits[SrcIdx][M] == 0)
7463         KnownZero.setBit(i);
7464     }
7465   }
7466 
7467   assert(VT.getVectorNumElements() == (unsigned)Size &&
7468          "Different mask size from vector size!");
7469   return true;
7470 }
7471 
7472 // Replace target shuffle mask elements with known undef/zero sentinels.
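// e.g. Mask = <0, 1, 2, 3> with KnownZero bit 2 set becomes
// <0, 1, SM_SentinelZero, 3> (KnownUndef bits are always resolved; KnownZero
// bits only when ResolveKnownZeros is set).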
7473 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
7474                                               const APInt &KnownUndef,
7475                                               const APInt &KnownZero,
7476                                               bool ResolveKnownZeros = true) {
7477   unsigned NumElts = Mask.size();
7478   assert(KnownUndef.getBitWidth() == NumElts &&
7479          KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
7480 
7481   for (unsigned i = 0; i != NumElts; ++i) {
7482     if (KnownUndef[i])
7483       Mask[i] = SM_SentinelUndef;
7484     else if (ResolveKnownZeros && KnownZero[i])
7485       Mask[i] = SM_SentinelZero;
7486   }
7487 }
7488 
7489 // Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
7490 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
7491                                               APInt &KnownUndef,
7492                                               APInt &KnownZero) {
7493   unsigned NumElts = Mask.size();
7494   KnownUndef = KnownZero = APInt::getNullValue(NumElts);
7495 
7496   for (unsigned i = 0; i != NumElts; ++i) {
7497     int M = Mask[i];
7498     if (SM_SentinelUndef == M)
7499       KnownUndef.setBit(i);
7500     if (SM_SentinelZero == M)
7501       KnownZero.setBit(i);
7502   }
7503 }
7504 
7505 // Forward declaration (for getFauxShuffleMask recursive check).
7506 // TODO: Use DemandedElts variant.
7507 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7508                                    SmallVectorImpl<int> &Mask,
7509                                    const SelectionDAG &DAG, unsigned Depth,
7510                                    bool ResolveKnownElts);
7511 
7512 // Attempt to decode ops that could be represented as a shuffle mask.
7513 // The decoded shuffle mask may contain a different number of elements to the
7514 // destination value type.
7515 static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
7516                                SmallVectorImpl<int> &Mask,
7517                                SmallVectorImpl<SDValue> &Ops,
7518                                const SelectionDAG &DAG, unsigned Depth,
7519                                bool ResolveKnownElts) {
7520   Mask.clear();
7521   Ops.clear();
7522 
7523   MVT VT = N.getSimpleValueType();
7524   unsigned NumElts = VT.getVectorNumElements();
7525   unsigned NumSizeInBits = VT.getSizeInBits();
7526   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
7527   if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
7528     return false;
7529   assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
7530   unsigned NumSizeInBytes = NumSizeInBits / 8;
7531   unsigned NumBytesPerElt = NumBitsPerElt / 8;
7532 
7533   unsigned Opcode = N.getOpcode();
7534   switch (Opcode) {
7535   case ISD::VECTOR_SHUFFLE: {
7536     // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle, so decode it here.
7537     ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
7538     if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
7539       Mask.append(ShuffleMask.begin(), ShuffleMask.end());
7540       Ops.push_back(N.getOperand(0));
7541       Ops.push_back(N.getOperand(1));
7542       return true;
7543     }
7544     return false;
7545   }
7546   case ISD::AND:
7547   case X86ISD::ANDNP: {
7548     // Attempt to decode as a per-byte mask.
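    // e.g. AND(X, <0xFF,0x00,0xFF,0x00,..>) decodes to the byte-level mask
    // <0, Z, 2, Z, ..>, keeping X's even bytes and zeroing the odd bytes
    // (for ANDNP the roles of the 0x00 and 0xFF constant bytes are swapped).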
7549     APInt UndefElts;
7550     SmallVector<APInt, 32> EltBits;
7551     SDValue N0 = N.getOperand(0);
7552     SDValue N1 = N.getOperand(1);
7553     bool IsAndN = (X86ISD::ANDNP == Opcode);
7554     uint64_t ZeroMask = IsAndN ? 255 : 0;
7555     if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
7556       return false;
7557     for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
7558       if (UndefElts[i]) {
7559         Mask.push_back(SM_SentinelUndef);
7560         continue;
7561       }
7562       const APInt &ByteBits = EltBits[i];
7563       if (ByteBits != 0 && ByteBits != 255)
7564         return false;
7565       Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
7566     }
7567     Ops.push_back(IsAndN ? N1 : N0);
7568     return true;
7569   }
7570   case ISD::OR: {
7571     // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
7572     // is a valid shuffle index.
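    // e.g. if SrcMask0 = <Z, 1, Z, 3> and SrcMask1 = <0, Z, 2, Z>, the merged
    // mask is <4, 1, 6, 3>, selecting directly from { N0, N1 }.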
7573     SDValue N0 = peekThroughBitcasts(N.getOperand(0));
7574     SDValue N1 = peekThroughBitcasts(N.getOperand(1));
7575     if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
7576       return false;
7577     SmallVector<int, 64> SrcMask0, SrcMask1;
7578     SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
7579     if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
7580                                 true) ||
7581         !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
7582                                 true))
7583       return false;
7584 
7585     size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
7586     SmallVector<int, 64> Mask0, Mask1;
7587     narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
7588     narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
7589     for (int i = 0; i != (int)MaskSize; ++i) {
7590       // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
7591       // loops converting between OR and BLEND shuffles due to
7592       // canWidenShuffleElements merging away undef elements, meaning we
7593       // fail to recognise the OR as the undef element isn't known zero.
7594       if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
7595         Mask.push_back(SM_SentinelZero);
7596       else if (Mask1[i] == SM_SentinelZero)
7597         Mask.push_back(i);
7598       else if (Mask0[i] == SM_SentinelZero)
7599         Mask.push_back(i + MaskSize);
7600       else
7601         return false;
7602     }
7603     Ops.push_back(N0);
7604     Ops.push_back(N1);
7605     return true;
7606   }
7607   case ISD::INSERT_SUBVECTOR: {
7608     SDValue Src = N.getOperand(0);
7609     SDValue Sub = N.getOperand(1);
7610     EVT SubVT = Sub.getValueType();
7611     unsigned NumSubElts = SubVT.getVectorNumElements();
7612     if (!N->isOnlyUserOf(Sub.getNode()))
7613       return false;
7614     uint64_t InsertIdx = N.getConstantOperandVal(2);
7615     // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
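    // e.g. v8i32 INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1, 4), 0) becomes
    // Mask = <12, 13, 14, 15, 4, 5, 6, 7> with Ops = { SRC0, SRC1 }.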
7616     if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7617         Sub.getOperand(0).getValueType() == VT) {
7618       uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
7619       for (int i = 0; i != (int)NumElts; ++i)
7620         Mask.push_back(i);
7621       for (int i = 0; i != (int)NumSubElts; ++i)
7622         Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
7623       Ops.push_back(Src);
7624       Ops.push_back(Sub.getOperand(0));
7625       return true;
7626     }
7627     // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
7628     SmallVector<int, 64> SubMask;
7629     SmallVector<SDValue, 2> SubInputs;
7630     if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
7631                                 SubMask, DAG, Depth + 1, ResolveKnownElts))
7632       return false;
7633 
7634     // Subvector shuffle inputs must not be larger than the subvector.
7635     if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
7636           return SubVT.getFixedSizeInBits() <
7637                  SubInput.getValueSizeInBits().getFixedSize();
7638         }))
7639       return false;
7640 
7641     if (SubMask.size() != NumSubElts) {
7642       assert(((SubMask.size() % NumSubElts) == 0 ||
7643               (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
7644       if ((NumSubElts % SubMask.size()) == 0) {
7645         int Scale = NumSubElts / SubMask.size();
7646         SmallVector<int, 64> ScaledSubMask;
7647         narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
7648         SubMask = ScaledSubMask;
7649       } else {
7650         int Scale = SubMask.size() / NumSubElts;
7651         NumSubElts = SubMask.size();
7652         NumElts *= Scale;
7653         InsertIdx *= Scale;
7654       }
7655     }
7656     Ops.push_back(Src);
7657     Ops.append(SubInputs.begin(), SubInputs.end());
7658     if (ISD::isBuildVectorAllZeros(Src.getNode()))
7659       Mask.append(NumElts, SM_SentinelZero);
7660     else
7661       for (int i = 0; i != (int)NumElts; ++i)
7662         Mask.push_back(i);
7663     for (int i = 0; i != (int)NumSubElts; ++i) {
7664       int M = SubMask[i];
7665       if (0 <= M) {
7666         int InputIdx = M / NumSubElts;
7667         M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
7668       }
7669       Mask[i + InsertIdx] = M;
7670     }
7671     return true;
7672   }
7673   case X86ISD::PINSRB:
7674   case X86ISD::PINSRW:
7675   case ISD::SCALAR_TO_VECTOR:
7676   case ISD::INSERT_VECTOR_ELT: {
7677     // Match against an insert_vector_elt/scalar_to_vector of an extract from a
7678     // vector, for matching src/dst vector types.
7679     SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
7680 
7681     unsigned DstIdx = 0;
7682     if (Opcode != ISD::SCALAR_TO_VECTOR) {
7683       // Check we have an in-range constant insertion index.
7684       if (!isa<ConstantSDNode>(N.getOperand(2)) ||
7685           N.getConstantOperandAPInt(2).uge(NumElts))
7686         return false;
7687       DstIdx = N.getConstantOperandVal(2);
7688 
7689       // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
7690       if (X86::isZeroNode(Scl)) {
7691         Ops.push_back(N.getOperand(0));
7692         for (unsigned i = 0; i != NumElts; ++i)
7693           Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
7694         return true;
7695       }
7696     }
7697 
7698     // Peek through trunc/aext/zext.
7699     // TODO: aext shouldn't require SM_SentinelZero padding.
7700     // TODO: handle shift of scalars.
7701     unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
7702     while (Scl.getOpcode() == ISD::TRUNCATE ||
7703            Scl.getOpcode() == ISD::ANY_EXTEND ||
7704            Scl.getOpcode() == ISD::ZERO_EXTEND) {
7705       Scl = Scl.getOperand(0);
7706       MinBitsPerElt =
7707           std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
7708     }
7709     if ((MinBitsPerElt % 8) != 0)
7710       return false;
7711 
7712     // Attempt to find the source vector the scalar was extracted from.
7713     SDValue SrcExtract;
7714     if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
7715          Scl.getOpcode() == X86ISD::PEXTRW ||
7716          Scl.getOpcode() == X86ISD::PEXTRB) &&
7717         Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
7718       SrcExtract = Scl;
7719     }
7720     if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
7721       return false;
7722 
7723     SDValue SrcVec = SrcExtract.getOperand(0);
7724     EVT SrcVT = SrcVec.getValueType();
7725     if (!SrcVT.getScalarType().isByteSized())
7726       return false;
7727     unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
7728     unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
7729     unsigned DstByte = DstIdx * NumBytesPerElt;
7730     MinBitsPerElt =
7731         std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
7732 
7733     // Create 'identity' byte level shuffle mask and then add inserted bytes.
7734     if (Opcode == ISD::SCALAR_TO_VECTOR) {
7735       Ops.push_back(SrcVec);
7736       Mask.append(NumSizeInBytes, SM_SentinelUndef);
7737     } else {
7738       Ops.push_back(SrcVec);
7739       Ops.push_back(N.getOperand(0));
7740       for (int i = 0; i != (int)NumSizeInBytes; ++i)
7741         Mask.push_back(NumSizeInBytes + i);
7742     }
7743 
7744     unsigned MinBytesPerElts = MinBitsPerElt / 8;
7745     MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
7746     for (unsigned i = 0; i != MinBytesPerElts; ++i)
7747       Mask[DstByte + i] = SrcByte + i;
7748     for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
7749       Mask[DstByte + i] = SM_SentinelZero;
7750     return true;
7751   }
7752   case X86ISD::PACKSS:
7753   case X86ISD::PACKUS: {
7754     SDValue N0 = N.getOperand(0);
7755     SDValue N1 = N.getOperand(1);
7756     assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
7757            N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
7758            "Unexpected input value type");
7759 
7760     APInt EltsLHS, EltsRHS;
7761     getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
7762 
7763     // If we know input saturation won't happen (or we don't care about
7764     // particular lanes), we can treat this as a truncation shuffle.
7765     bool Offset0 = false, Offset1 = false;
7766     if (Opcode == X86ISD::PACKSS) {
7767       if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
7768            DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
7769           (!(N1.isUndef() || EltsRHS.isNullValue()) &&
7770            DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
7771         return false;
7772       // We can't easily fold ASHR into a shuffle, but if it was feeding a
7773       // PACKSS then it was likely being used for sign-extension for a
7774       // truncation, so just peek through and adjust the mask accordingly.
7775       if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
7776           N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
7777         Offset0 = true;
7778         N0 = N0.getOperand(0);
7779       }
7780       if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
7781           N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
7782         Offset1 = true;
7783         N1 = N1.getOperand(0);
7784       }
7785     } else {
7786       APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
7787       if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
7788            !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
7789           (!(N1.isUndef() || EltsRHS.isNullValue()) &&
7790            !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
7791         return false;
7792     }
7793 
7794     bool IsUnary = (N0 == N1);
7795 
7796     Ops.push_back(N0);
7797     if (!IsUnary)
7798       Ops.push_back(N1);
7799 
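    // e.g. a binary v16i8 PACKUS of two v8i16 inputs produces the byte mask
    // <0, 2, 4, .., 14, 16, 18, .., 30>, i.e. the low byte of every source
    // word (the unary case repeats indices 0, 2, .., 14 for both halves).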
7800     createPackShuffleMask(VT, Mask, IsUnary);
7801 
7802     if (Offset0 || Offset1) {
7803       for (int &M : Mask)
7804         if ((Offset0 && isInRange(M, 0, NumElts)) ||
7805             (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
7806           ++M;
7807     }
7808     return true;
7809   }
7810   case X86ISD::VTRUNC: {
7811     SDValue Src = N.getOperand(0);
7812     EVT SrcVT = Src.getValueType();
7813     // Truncated source must be a simple vector.
7814     if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7815         (SrcVT.getScalarSizeInBits() % 8) != 0)
7816       return false;
7817     unsigned NumSrcElts = SrcVT.getVectorNumElements();
7818     unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
7819     unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
7820     assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
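    // e.g. v16i8 VTRUNC(v4i32 X) has Scale == 4 and builds the mask
    // <0, 4, 8, 12> followed by 12 x Z, i.e. the low byte of each i32 element
    // with the remaining destination elements known zero.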
7821     for (unsigned i = 0; i != NumSrcElts; ++i)
7822       Mask.push_back(i * Scale);
7823     Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
7824     Ops.push_back(Src);
7825     return true;
7826   }
7827   case X86ISD::VSHLI:
7828   case X86ISD::VSRLI: {
7829     uint64_t ShiftVal = N.getConstantOperandVal(1);
7830     // Out of range bit shifts are guaranteed to be zero.
7831     if (NumBitsPerElt <= ShiftVal) {
7832       Mask.append(NumElts, SM_SentinelZero);
7833       return true;
7834     }
7835 
7836     // We can only decode 'whole byte' bit shifts as shuffles.
7837     if ((ShiftVal % 8) != 0)
7838       break;
7839 
7840     uint64_t ByteShift = ShiftVal / 8;
7841     Ops.push_back(N.getOperand(0));
7842 
7843     // Clear mask to all zeros and insert the shifted byte indices.
7844     Mask.append(NumSizeInBytes, SM_SentinelZero);
7845 
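    // e.g. v2i64 VSHLI by 16 bits has ByteShift == 2 and yields the byte mask
    // <Z, Z, 0, 1, 2, 3, 4, 5, Z, Z, 8, 9, 10, 11, 12, 13>; VSRLI by 16 gives
    // <2, 3, 4, 5, 6, 7, Z, Z, 10, 11, 12, 13, 14, 15, Z, Z>.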
7846     if (X86ISD::VSHLI == Opcode) {
7847       for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
7848         for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7849           Mask[i + j] = i + j - ByteShift;
7850     } else {
7851       for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
7852         for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7853           Mask[i + j - ByteShift] = i + j;
7854     }
7855     return true;
7856   }
7857   case X86ISD::VROTLI:
7858   case X86ISD::VROTRI: {
7859     // We can only decode 'whole byte' bit rotates as shuffles.
7860     uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
7861     if ((RotateVal % 8) != 0)
7862       return false;
7863     Ops.push_back(N.getOperand(0));
7864     int Offset = RotateVal / 8;
7865     Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
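    // e.g. v2i64 VROTRI by 8 bits has Offset == 1, giving the per-lane byte
    // pattern <1, 2, 3, 4, 5, 6, 7, 0> (relative to BaseIdx); VROTLI by 8 uses
    // Offset == 7 and gives <7, 0, 1, 2, 3, 4, 5, 6>.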
7866     for (int i = 0; i != (int)NumElts; ++i) {
7867       int BaseIdx = i * NumBytesPerElt;
7868       for (int j = 0; j != (int)NumBytesPerElt; ++j) {
7869         Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
7870       }
7871     }
7872     return true;
7873   }
7874   case X86ISD::VBROADCAST: {
7875     SDValue Src = N.getOperand(0);
7876     if (!Src.getSimpleValueType().isVector()) {
7877       if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7878           !isNullConstant(Src.getOperand(1)) ||
7879           Src.getOperand(0).getValueType().getScalarType() !=
7880               VT.getScalarType())
7881         return false;
7882       Src = Src.getOperand(0);
7883     }
7884     Ops.push_back(Src);
7885     Mask.append(NumElts, 0);
7886     return true;
7887   }
7888   case ISD::ZERO_EXTEND:
7889   case ISD::ANY_EXTEND:
7890   case ISD::ZERO_EXTEND_VECTOR_INREG:
7891   case ISD::ANY_EXTEND_VECTOR_INREG: {
7892     SDValue Src = N.getOperand(0);
7893     EVT SrcVT = Src.getValueType();
7894 
7895     // Extended source must be a simple vector.
7896     if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7897         (SrcVT.getScalarSizeInBits() % 8) != 0)
7898       return false;
7899 
7900     bool IsAnyExtend =
7901         (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
7902     DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
7903                          IsAnyExtend, Mask);
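    // e.g. ZERO_EXTEND_VECTOR_INREG from v8i16 to v4i32 yields the i16-element
    // mask <0, Z, 1, Z, 2, Z, 3, Z> (any-extend pads with SM_SentinelUndef
    // instead of SM_SentinelZero).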
7904     Ops.push_back(Src);
7905     return true;
7906   }
7907   }
7908 
7909   return false;
7910 }
7911 
7912 /// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
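/// e.g. Inputs = { A, A } with Mask = <0, 5, 2, 7> becomes Inputs = { A } with
/// Mask = <0, 1, 2, 3>.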
7913 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
7914                                               SmallVectorImpl<int> &Mask) {
7915   int MaskWidth = Mask.size();
7916   SmallVector<SDValue, 16> UsedInputs;
7917   for (int i = 0, e = Inputs.size(); i < e; ++i) {
7918     int lo = UsedInputs.size() * MaskWidth;
7919     int hi = lo + MaskWidth;
7920 
7921     // Strip UNDEF input usage.
7922     if (Inputs[i].isUndef())
7923       for (int &M : Mask)
7924         if ((lo <= M) && (M < hi))
7925           M = SM_SentinelUndef;
7926 
7927     // Check for unused inputs.
7928     if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
7929       for (int &M : Mask)
7930         if (lo <= M)
7931           M -= MaskWidth;
7932       continue;
7933     }
7934 
7935     // Check for repeated inputs.
7936     bool IsRepeat = false;
7937     for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
7938       if (UsedInputs[j] != Inputs[i])
7939         continue;
7940       for (int &M : Mask)
7941         if (lo <= M)
7942           M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
7943       IsRepeat = true;
7944       break;
7945     }
7946     if (IsRepeat)
7947       continue;
7948 
7949     UsedInputs.push_back(Inputs[i]);
7950   }
7951   Inputs = UsedInputs;
7952 }
7953 
7954 /// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
7955 /// and then sets the SM_SentinelUndef and SM_SentinelZero values.
7956 /// Returns true if the target shuffle mask was decoded.
7957 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
7958                                    SmallVectorImpl<SDValue> &Inputs,
7959                                    SmallVectorImpl<int> &Mask,
7960                                    APInt &KnownUndef, APInt &KnownZero,
7961                                    const SelectionDAG &DAG, unsigned Depth,
7962                                    bool ResolveKnownElts) {
7963   EVT VT = Op.getValueType();
7964   if (!VT.isSimple() || !VT.isVector())
7965     return false;
7966 
7967   if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
7968     if (ResolveKnownElts)
7969       resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
7970     return true;
7971   }
7972   if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
7973                          ResolveKnownElts)) {
7974     resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
7975     return true;
7976   }
7977   return false;
7978 }
7979 
7980 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7981                                    SmallVectorImpl<int> &Mask,
7982                                    const SelectionDAG &DAG, unsigned Depth = 0,
7983                                    bool ResolveKnownElts = true) {
7984   EVT VT = Op.getValueType();
7985   if (!VT.isSimple() || !VT.isVector())
7986     return false;
7987 
7988   APInt KnownUndef, KnownZero;
7989   unsigned NumElts = Op.getValueType().getVectorNumElements();
7990   APInt DemandedElts = APInt::getAllOnesValue(NumElts);
7991   return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
7992                                 KnownZero, DAG, Depth, ResolveKnownElts);
7993 }
7994 
7995 // Attempt to create a scalar/subvector broadcast from the base MemSDNode.
7996 static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
7997                                  EVT MemVT, MemSDNode *Mem, unsigned Offset,
7998                                  SelectionDAG &DAG) {
7999   assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8000           Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8001          "Unknown broadcast load type");
8002 
8003   // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8004   if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8005     return SDValue();
8006 
8007   SDValue Ptr =
8008       DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8009   SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8010   SDValue Ops[] = {Mem->getChain(), Ptr};
8011   SDValue BcstLd = DAG.getMemIntrinsicNode(
8012       Opcode, DL, Tys, Ops, MemVT,
8013       DAG.getMachineFunction().getMachineMemOperand(
8014           Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8015   DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8016   return BcstLd;
8017 }
8018 
8019 /// Returns the scalar element that will make up the i'th
8020 /// element of the result of the vector shuffle.
8021 static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8022                                    SelectionDAG &DAG, unsigned Depth) {
8023   if (Depth >= SelectionDAG::MaxRecursionDepth)
8024     return SDValue(); // Limit search depth.
8025 
8026   EVT VT = Op.getValueType();
8027   unsigned Opcode = Op.getOpcode();
8028   unsigned NumElems = VT.getVectorNumElements();
8029 
8030   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8031   if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8032     int Elt = SV->getMaskElt(Index);
8033 
8034     if (Elt < 0)
8035       return DAG.getUNDEF(VT.getVectorElementType());
8036 
8037     SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8038     return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8039   }
8040 
8041   // Recurse into target specific vector shuffles to find scalars.
8042   if (isTargetShuffle(Opcode)) {
8043     MVT ShufVT = VT.getSimpleVT();
8044     MVT ShufSVT = ShufVT.getVectorElementType();
8045     int NumElems = (int)ShufVT.getVectorNumElements();
8046     SmallVector<int, 16> ShuffleMask;
8047     SmallVector<SDValue, 16> ShuffleOps;
8048     if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8049                               ShuffleMask))
8050       return SDValue();
8051 
8052     int Elt = ShuffleMask[Index];
8053     if (Elt == SM_SentinelZero)
8054       return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8055                                  : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8056     if (Elt == SM_SentinelUndef)
8057       return DAG.getUNDEF(ShufSVT);
8058 
8059     assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8060     SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8061     return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8062   }
8063 
8064   // Recurse into insert_subvector base/sub vector to find scalars.
8065   if (Opcode == ISD::INSERT_SUBVECTOR) {
8066     SDValue Vec = Op.getOperand(0);
8067     SDValue Sub = Op.getOperand(1);
8068     uint64_t SubIdx = Op.getConstantOperandVal(2);
8069     unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8070 
8071     if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8072       return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8073     return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8074   }
8075 
8076   // Recurse into concat_vectors sub vector to find scalars.
8077   if (Opcode == ISD::CONCAT_VECTORS) {
8078     EVT SubVT = Op.getOperand(0).getValueType();
8079     unsigned NumSubElts = SubVT.getVectorNumElements();
8080     uint64_t SubIdx = Index / NumSubElts;
8081     uint64_t SubElt = Index % NumSubElts;
8082     return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8083   }
8084 
8085   // Recurse into extract_subvector src vector to find scalars.
8086   if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8087     SDValue Src = Op.getOperand(0);
8088     uint64_t SrcIdx = Op.getConstantOperandVal(1);
8089     return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8090   }
8091 
8092   // We only peek through bitcasts of the same vector width.
8093   if (Opcode == ISD::BITCAST) {
8094     SDValue Src = Op.getOperand(0);
8095     EVT SrcVT = Src.getValueType();
8096     if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8097       return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8098     return SDValue();
8099   }
8100 
8101   // Actual nodes that may contain scalar elements
8102 
8103   // For insert_vector_elt - either return the index matching scalar or recurse
8104   // into the base vector.
8105   if (Opcode == ISD::INSERT_VECTOR_ELT &&
8106       isa<ConstantSDNode>(Op.getOperand(2))) {
8107     if (Op.getConstantOperandAPInt(2) == Index)
8108       return Op.getOperand(1);
8109     return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8110   }
8111 
8112   if (Opcode == ISD::SCALAR_TO_VECTOR)
8113     return (Index == 0) ? Op.getOperand(0)
8114                         : DAG.getUNDEF(VT.getVectorElementType());
8115 
8116   if (Opcode == ISD::BUILD_VECTOR)
8117     return Op.getOperand(Index);
8118 
8119   return SDValue();
8120 }
8121 
8122 // Use PINSRB/PINSRW/PINSRD to create a build vector.
8123 static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8124                                         unsigned NumNonZero, unsigned NumZero,
8125                                         SelectionDAG &DAG,
8126                                         const X86Subtarget &Subtarget) {
8127   MVT VT = Op.getSimpleValueType();
8128   unsigned NumElts = VT.getVectorNumElements();
8129   assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8130           ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8131          "Illegal vector insertion");
8132 
8133   SDLoc dl(Op);
8134   SDValue V;
8135   bool First = true;
8136 
8137   for (unsigned i = 0; i < NumElts; ++i) {
8138     bool IsNonZero = NonZeroMask[i];
8139     if (!IsNonZero)
8140       continue;
8141 
8142     // If the build vector contains zeros or our first insertion is not the
8143     // first index, then insert into a zero vector to break any register
8144     // dependency; else use SCALAR_TO_VECTOR.
8145     if (First) {
8146       First = false;
8147       if (NumZero || 0 != i)
8148         V = getZeroVector(VT, Subtarget, DAG, dl);
8149       else {
8150         assert(0 == i && "Expected insertion into zero-index");
8151         V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8152         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
8153         V = DAG.getBitcast(VT, V);
8154         continue;
8155       }
8156     }
8157     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
8158                     DAG.getIntPtrConstant(i, dl));
8159   }
8160 
8161   return V;
8162 }
8163 
8164 /// Custom lower build_vector of v16i8.
8165 static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
8166                                      unsigned NumNonZero, unsigned NumZero,
8167                                      SelectionDAG &DAG,
8168                                      const X86Subtarget &Subtarget) {
8169   if (NumNonZero > 8 && !Subtarget.hasSSE41())
8170     return SDValue();
8171 
8172   // SSE4.1 - use PINSRB to insert each byte directly.
8173   if (Subtarget.hasSSE41())
8174     return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8175                                     Subtarget);
8176 
8177   SDLoc dl(Op);
8178   SDValue V;
8179 
8180   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
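  // e.g. a byte pair (lo, hi) is merged into the i16 value (lo | (hi << 8))
  // and inserted with a single PINSRW.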
8181   for (unsigned i = 0; i < 16; i += 2) {
8182     bool ThisIsNonZero = NonZeroMask[i];
8183     bool NextIsNonZero = NonZeroMask[i + 1];
8184     if (!ThisIsNonZero && !NextIsNonZero)
8185       continue;
8186 
8187     // FIXME: Investigate combining the first 4 bytes as a i32 instead.
8188     SDValue Elt;
8189     if (ThisIsNonZero) {
8190       if (NumZero || NextIsNonZero)
8191         Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8192       else
8193         Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8194     }
8195 
8196     if (NextIsNonZero) {
8197       SDValue NextElt = Op.getOperand(i + 1);
8198       if (i == 0 && NumZero)
8199         NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
8200       else
8201         NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
8202       NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
8203                             DAG.getConstant(8, dl, MVT::i8));
8204       if (ThisIsNonZero)
8205         Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
8206       else
8207         Elt = NextElt;
8208     }
8209 
8210     // If our first insertion is not the first index or zeros are needed, then
8211     // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
8212     // elements undefined).
8213     if (!V) {
8214       if (i != 0 || NumZero)
8215         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
8216       else {
8217         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
8218         V = DAG.getBitcast(MVT::v8i16, V);
8219         continue;
8220       }
8221     }
8222     Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
8223     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
8224                     DAG.getIntPtrConstant(i / 2, dl));
8225   }
8226 
8227   return DAG.getBitcast(MVT::v16i8, V);
8228 }
8229 
8230 /// Custom lower build_vector of v8i16.
8231 static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
8232                                      unsigned NumNonZero, unsigned NumZero,
8233                                      SelectionDAG &DAG,
8234                                      const X86Subtarget &Subtarget) {
8235   if (NumNonZero > 4 && !Subtarget.hasSSE41())
8236     return SDValue();
8237 
8238   // Use PINSRW to insert each i16 element directly.
8239   return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8240                                   Subtarget);
8241 }
8242 
8243 /// Custom lower build_vector of v4i32 or v4f32.
8244 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
8245                                      const X86Subtarget &Subtarget) {
8246   // If this is a splat of a pair of elements, use MOVDDUP (unless the target
8247   // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
8248   // Because we're creating a less complicated build vector here, we may enable
8249   // further folding of the MOVDDUP via shuffle transforms.
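  // e.g. (a, b, a, b) is rebuilt as (a, b, undef, undef), bitcast to v2f64,
  // duplicated with MOVDDUP and bitcast back to the original type.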
8250   if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
8251       Op.getOperand(0) == Op.getOperand(2) &&
8252       Op.getOperand(1) == Op.getOperand(3) &&
8253       Op.getOperand(0) != Op.getOperand(1)) {
8254     SDLoc DL(Op);
8255     MVT VT = Op.getSimpleValueType();
8256     MVT EltVT = VT.getVectorElementType();
8257     // Create a new build vector with the first 2 elements followed by undef
8258     // padding, bitcast to v2f64, duplicate, and bitcast back.
8259     SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8260                        DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8261     SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
8262     SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
8263     return DAG.getBitcast(VT, Dup);
8264   }
8265 
8266   // Find all zeroable elements.
8267   std::bitset<4> Zeroable, Undefs;
8268   for (int i = 0; i < 4; ++i) {
8269     SDValue Elt = Op.getOperand(i);
8270     Undefs[i] = Elt.isUndef();
8271     Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
8272   }
8273   assert(Zeroable.size() - Zeroable.count() > 1 &&
8274          "We expect at least two non-zero elements!");
8275 
8276   // We only know how to deal with build_vector nodes where elements are either
8277   // zeroable or extract_vector_elt with constant index.
8278   SDValue FirstNonZero;
8279   unsigned FirstNonZeroIdx;
8280   for (unsigned i = 0; i < 4; ++i) {
8281     if (Zeroable[i])
8282       continue;
8283     SDValue Elt = Op.getOperand(i);
8284     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8285         !isa<ConstantSDNode>(Elt.getOperand(1)))
8286       return SDValue();
8287     // Make sure that this node is extracting from a 128-bit vector.
8288     MVT VT = Elt.getOperand(0).getSimpleValueType();
8289     if (!VT.is128BitVector())
8290       return SDValue();
8291     if (!FirstNonZero.getNode()) {
8292       FirstNonZero = Elt;
8293       FirstNonZeroIdx = i;
8294     }
8295   }
8296 
8297   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
8298   SDValue V1 = FirstNonZero.getOperand(0);
8299   MVT VT = V1.getSimpleValueType();
8300 
8301   // See if this build_vector can be lowered as a blend with zero.
8302   SDValue Elt;
8303   unsigned EltMaskIdx, EltIdx;
8304   int Mask[4];
8305   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
8306     if (Zeroable[EltIdx]) {
8307       // The zero vector will be on the right hand side.
8308       Mask[EltIdx] = EltIdx+4;
8309       continue;
8310     }
8311 
8312     Elt = Op->getOperand(EltIdx);
8313     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
8314     EltMaskIdx = Elt.getConstantOperandVal(1);
8315     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
8316       break;
8317     Mask[EltIdx] = EltIdx;
8318   }
8319 
8320   if (EltIdx == 4) {
8321     // Let the shuffle legalizer deal with blend operations.
8322     SDValue VZeroOrUndef = (Zeroable == Undefs)
8323                                ? DAG.getUNDEF(VT)
8324                                : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
8325     if (V1.getSimpleValueType() != VT)
8326       V1 = DAG.getBitcast(VT, V1);
8327     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
8328   }
8329 
8330   // See if we can lower this build_vector to a INSERTPS.
8331   if (!Subtarget.hasSSE41())
8332     return SDValue();
8333 
8334   SDValue V2 = Elt.getOperand(0);
8335   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
8336     V1 = SDValue();
8337 
8338   bool CanFold = true;
8339   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
8340     if (Zeroable[i])
8341       continue;
8342 
8343     SDValue Current = Op->getOperand(i);
8344     SDValue SrcVector = Current->getOperand(0);
8345     if (!V1.getNode())
8346       V1 = SrcVector;
8347     CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
8348   }
8349 
8350   if (!CanFold)
8351     return SDValue();
8352 
8353   assert(V1.getNode() && "Expected at least two non-zero elements!");
8354   if (V1.getSimpleValueType() != MVT::v4f32)
8355     V1 = DAG.getBitcast(MVT::v4f32, V1);
8356   if (V2.getSimpleValueType() != MVT::v4f32)
8357     V2 = DAG.getBitcast(MVT::v4f32, V2);
8358 
8359   // Ok, we can emit an INSERTPS instruction.
8360   unsigned ZMask = Zeroable.to_ulong();
8361 
8362   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
8363   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8364   SDLoc DL(Op);
8365   SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8366                                DAG.getIntPtrConstant(InsertPSMask, DL, true));
8367   return DAG.getBitcast(VT, Result);
8368 }
8369 
8370 /// Return a vector logical shift node.
8371 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
8372                          SelectionDAG &DAG, const TargetLowering &TLI,
8373                          const SDLoc &dl) {
8374   assert(VT.is128BitVector() && "Unknown type for VShift");
8375   MVT ShVT = MVT::v16i8;
8376   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
8377   SrcOp = DAG.getBitcast(ShVT, SrcOp);
8378   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
8379   SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
8380   return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
8381 }
8382 
8383 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
8384                                       SelectionDAG &DAG) {
8385 
8386   // Check if the scalar load can be widened into a vector load. And if
8387   // the address is "base + cst" see if the cst can be "absorbed" into
8388   // the shuffle mask.
8389   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
8390     SDValue Ptr = LD->getBasePtr();
8391     if (!ISD::isNormalLoad(LD) || !LD->isSimple())
8392       return SDValue();
8393     EVT PVT = LD->getValueType(0);
8394     if (PVT != MVT::i32 && PVT != MVT::f32)
8395       return SDValue();
8396 
8397     int FI = -1;
8398     int64_t Offset = 0;
8399     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
8400       FI = FINode->getIndex();
8401       Offset = 0;
8402     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
8403                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
8404       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
8405       Offset = Ptr.getConstantOperandVal(1);
8406       Ptr = Ptr.getOperand(0);
8407     } else {
8408       return SDValue();
8409     }
8410 
8411     // FIXME: 256-bit vector instructions don't require a strict alignment;
8412     // improve this code to support it better.
8413     Align RequiredAlign(VT.getSizeInBits() / 8);
8414     SDValue Chain = LD->getChain();
8415     // Make sure the stack object alignment is at least 16 or 32.
8416     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8417     MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
8418     if (!InferredAlign || *InferredAlign < RequiredAlign) {
8419       if (MFI.isFixedObjectIndex(FI)) {
8420         // Can't change the alignment. FIXME: It's possible to compute
8421         // the exact stack offset and reference FI + adjust offset instead.
8422         // If someone *really* cares about this, that's the way to implement it.
8423         return SDValue();
8424       } else {
8425         MFI.setObjectAlignment(FI, RequiredAlign);
8426       }
8427     }
8428 
8429     // (Offset % 16 or 32) must be a multiple of 4. The address is then
8430     // Ptr + (Offset & ~15).
8431     if (Offset < 0)
8432       return SDValue();
8433     if ((Offset % RequiredAlign.value()) & 3)
8434       return SDValue();
8435     int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
8436     if (StartOffset) {
8437       SDLoc DL(Ptr);
8438       Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8439                         DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
8440     }
8441 
8442     int EltNo = (Offset - StartOffset) >> 2;
8443     unsigned NumElems = VT.getVectorNumElements();
8444 
8445     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
8446     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
8447                              LD->getPointerInfo().getWithOffset(StartOffset));
8448 
8449     SmallVector<int, 8> Mask(NumElems, EltNo);
8450 
8451     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
8452   }
8453 
8454   return SDValue();
8455 }
8456 
8457 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
8458 static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
8459   if (ISD::isNON_EXTLoad(Elt.getNode())) {
8460     auto *BaseLd = cast<LoadSDNode>(Elt);
8461     if (!BaseLd->isSimple())
8462       return false;
8463     Ld = BaseLd;
8464     ByteOffset = 0;
8465     return true;
8466   }
8467 
8468   switch (Elt.getOpcode()) {
8469   case ISD::BITCAST:
8470   case ISD::TRUNCATE:
8471   case ISD::SCALAR_TO_VECTOR:
8472     return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
8473   case ISD::SRL:
8474     if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8475       uint64_t Idx = IdxC->getZExtValue();
8476       if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
8477         ByteOffset += Idx / 8;
8478         return true;
8479       }
8480     }
8481     break;
8482   case ISD::EXTRACT_VECTOR_ELT:
8483     if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8484       SDValue Src = Elt.getOperand(0);
8485       unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
8486       unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
8487       if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
8488           findEltLoadSrc(Src, Ld, ByteOffset)) {
8489         uint64_t Idx = IdxC->getZExtValue();
8490         ByteOffset += Idx * (SrcSizeInBits / 8);
8491         return true;
8492       }
8493     }
8494     break;
8495   }
8496 
8497   return false;
8498 }
8499 
8500 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
8501 /// elements can be replaced by a single large load which has the same value as
8502 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
8503 ///
8504 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
8505 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
8506                                         const SDLoc &DL, SelectionDAG &DAG,
8507                                         const X86Subtarget &Subtarget,
8508                                         bool IsAfterLegalize) {
8509   if ((VT.getScalarSizeInBits() % 8) != 0)
8510     return SDValue();
8511 
8512   unsigned NumElems = Elts.size();
8513 
8514   int LastLoadedElt = -1;
8515   APInt LoadMask = APInt::getNullValue(NumElems);
8516   APInt ZeroMask = APInt::getNullValue(NumElems);
8517   APInt UndefMask = APInt::getNullValue(NumElems);
8518 
8519   SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
8520   SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
8521 
8522   // For each element in the initializer, see if we've found a load, zero or an
8523   // undef.
8524   for (unsigned i = 0; i < NumElems; ++i) {
8525     SDValue Elt = peekThroughBitcasts(Elts[i]);
8526     if (!Elt.getNode())
8527       return SDValue();
8528     if (Elt.isUndef()) {
8529       UndefMask.setBit(i);
8530       continue;
8531     }
8532     if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
8533       ZeroMask.setBit(i);
8534       continue;
8535     }
8536 
8537     // Each loaded element must be the correct fractional portion of the
8538     // requested vector load.
8539     unsigned EltSizeInBits = Elt.getValueSizeInBits();
8540     if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
8541       return SDValue();
8542 
8543     if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
8544       return SDValue();
8545     unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
8546     if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
8547       return SDValue();
8548 
8549     LoadMask.setBit(i);
8550     LastLoadedElt = i;
8551   }
8552   assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
8553           LoadMask.countPopulation()) == NumElems &&
8554          "Incomplete element masks");
8555 
8556   // Handle Special Cases - all undef or undef/zero.
8557   if (UndefMask.countPopulation() == NumElems)
8558     return DAG.getUNDEF(VT);
8559   if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
8560     return VT.isInteger() ? DAG.getConstant(0, DL, VT)
8561                           : DAG.getConstantFP(0.0, DL, VT);
8562 
8563   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8564   int FirstLoadedElt = LoadMask.countTrailingZeros();
8565   SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
8566   EVT EltBaseVT = EltBase.getValueType();
8567   assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
8568          "Register/Memory size mismatch");
8569   LoadSDNode *LDBase = Loads[FirstLoadedElt];
8570   assert(LDBase && "Did not find base load for merging consecutive loads");
8571   unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
8572   unsigned BaseSizeInBytes = BaseSizeInBits / 8;
8573   int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
8574   int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
8575   assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
8576 
8577   // TODO: Support offsetting the base load.
8578   if (ByteOffsets[FirstLoadedElt] != 0)
8579     return SDValue();
8580 
8581   // Check to see if the element's load is consecutive to the base load
8582   // or offset from a previous (already checked) load.
8583   auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
8584     LoadSDNode *Ld = Loads[EltIdx];
8585     int64_t ByteOffset = ByteOffsets[EltIdx];
8586     if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
8587       int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
8588       return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
8589               Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
8590     }
8591     return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
8592                                               EltIdx - FirstLoadedElt);
8593   };
8594 
8595   // Consecutive loads can contain UNDEFs but not ZERO elements.
8596   // Consecutive loads with UNDEF and ZERO elements require an
8597   // additional shuffle stage to clear the ZERO elements.
8598   bool IsConsecutiveLoad = true;
8599   bool IsConsecutiveLoadWithZeros = true;
8600   for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
8601     if (LoadMask[i]) {
8602       if (!CheckConsecutiveLoad(LDBase, i)) {
8603         IsConsecutiveLoad = false;
8604         IsConsecutiveLoadWithZeros = false;
8605         break;
8606       }
8607     } else if (ZeroMask[i]) {
8608       IsConsecutiveLoad = false;
8609     }
8610   }
8611 
8612   auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
8613     auto MMOFlags = LDBase->getMemOperand()->getFlags();
8614     assert(LDBase->isSimple() &&
8615            "Cannot merge volatile or atomic loads.");
8616     SDValue NewLd =
8617         DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
8618                     LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
8619                     MMOFlags);
8620     for (auto *LD : Loads)
8621       if (LD)
8622         DAG.makeEquivalentMemoryOrdering(LD, NewLd);
8623     return NewLd;
8624   };
8625 
8626   // Check if the base load is entirely dereferenceable.
8627   bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
8628       VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
8629 
8630   // LOAD - all consecutive load/undefs (must start/end with a load or be
8631   // entirely dereferenceable). If we have found an entire vector of loads and
8632   // undefs, then return a large load of the entire vector width starting at the
8633   // base pointer. If the vector contains zeros, then attempt to shuffle those
8634   // elements.
8635   if (FirstLoadedElt == 0 &&
8636       (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
8637       (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
8638     if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
8639       return SDValue();
8640 
8641     // Don't create 256-bit non-temporal aligned loads without AVX2 as these
8642     // will lower to regular temporal loads and use the cache.
8643     if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
8644         VT.is256BitVector() && !Subtarget.hasInt256())
8645       return SDValue();
8646 
8647     if (NumElems == 1)
8648       return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
8649 
8650     if (!ZeroMask)
8651       return CreateLoad(VT, LDBase);
8652 
8653     // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
8654     // vector and a zero vector to clear out the zero elements.
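    // e.g. for v4i32 elements <load, zero, load, load>, the clear mask would
    // be <0, 5, 2, 3>, taking lane 1 from the all-zeros vector.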
8655     if (!IsAfterLegalize && VT.isVector()) {
8656       unsigned NumMaskElts = VT.getVectorNumElements();
8657       if ((NumMaskElts % NumElems) == 0) {
8658         unsigned Scale = NumMaskElts / NumElems;
8659         SmallVector<int, 4> ClearMask(NumMaskElts, -1);
8660         for (unsigned i = 0; i < NumElems; ++i) {
8661           if (UndefMask[i])
8662             continue;
8663           int Offset = ZeroMask[i] ? NumMaskElts : 0;
8664           for (unsigned j = 0; j != Scale; ++j)
8665             ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
8666         }
8667         SDValue V = CreateLoad(VT, LDBase);
8668         SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
8669                                    : DAG.getConstantFP(0.0, DL, VT);
8670         return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
8671       }
8672     }
8673   }
8674 
8675   // If the upper half of a ymm/zmm load is undef then just load the lower half.
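  // e.g. if elements 4-7 of a v8f32 build vector are all undef, we can form a
  // v4f32 load and insert it at subvector index 0 of an undef v8f32.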
8676   if (VT.is256BitVector() || VT.is512BitVector()) {
8677     unsigned HalfNumElems = NumElems / 2;
8678     if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
8679       EVT HalfVT =
8680           EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
8681       SDValue HalfLD =
8682           EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
8683                                    DAG, Subtarget, IsAfterLegalize);
8684       if (HalfLD)
8685         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
8686                            HalfLD, DAG.getIntPtrConstant(0, DL));
8687     }
8688   }
8689 
8690   // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
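  // e.g. <(load i32 x), zero, zero, zero> for v4i32 becomes a single 32-bit
  // X86ISD::VZEXT_LOAD that zero-fills the upper lanes.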
8691   if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
8692       (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
8693       ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
8694     MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
8695                                       : MVT::getIntegerVT(LoadSizeInBits);
8696     MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
    // Allow v4f32 on SSE1-only targets.
8698     // FIXME: Add more isel patterns so we can just use VT directly.
8699     if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
8700       VecVT = MVT::v4f32;
8701     if (TLI.isTypeLegal(VecVT)) {
8702       SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
8703       SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
8704       SDValue ResNode = DAG.getMemIntrinsicNode(
8705           X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
8706           LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
8707       for (auto *LD : Loads)
8708         if (LD)
8709           DAG.makeEquivalentMemoryOrdering(LD, ResNode);
8710       return DAG.getBitcast(VT, ResNode);
8711     }
8712   }
8713 
8714   // BROADCAST - match the smallest possible repetition pattern, load that
8715   // scalar/subvector element and then broadcast to the entire vector.
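  // e.g. <load a[0], load a[1], load a[0], load a[1]> repeats every two
  // elements, so the repeated pair can be loaded once as a wider scalar or
  // subvector and then broadcast.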
8716   if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
8717       (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
8718     for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
8719       unsigned RepeatSize = SubElems * BaseSizeInBits;
8720       unsigned ScalarSize = std::min(RepeatSize, 64u);
8721       if (!Subtarget.hasAVX2() && ScalarSize < 32)
8722         continue;
8723 
      // Don't attempt a 1:N subvector broadcast - it should be caught by
      // combineConcatVectorOps, else it will cause infinite loops.
8726       if (RepeatSize > ScalarSize && SubElems == 1)
8727         continue;
8728 
8729       bool Match = true;
8730       SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
8731       for (unsigned i = 0; i != NumElems && Match; ++i) {
8732         if (!LoadMask[i])
8733           continue;
8734         SDValue Elt = peekThroughBitcasts(Elts[i]);
8735         if (RepeatedLoads[i % SubElems].isUndef())
8736           RepeatedLoads[i % SubElems] = Elt;
8737         else
8738           Match &= (RepeatedLoads[i % SubElems] == Elt);
8739       }
8740 
8741       // We must have loads at both ends of the repetition.
8742       Match &= !RepeatedLoads.front().isUndef();
8743       Match &= !RepeatedLoads.back().isUndef();
8744       if (!Match)
8745         continue;
8746 
8747       EVT RepeatVT =
8748           VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
8749               ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
8750               : EVT::getFloatingPointVT(ScalarSize);
8751       if (RepeatSize > ScalarSize)
8752         RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
8753                                     RepeatSize / ScalarSize);
8754       EVT BroadcastVT =
8755           EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
8756                            VT.getSizeInBits() / ScalarSize);
8757       if (TLI.isTypeLegal(BroadcastVT)) {
8758         if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
8759                 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
8760           SDValue Broadcast = RepeatLoad;
8761           if (RepeatSize > ScalarSize) {
8762             while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
8763               Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
8764           } else {
8765             Broadcast =
8766                 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
8767           }
8768           return DAG.getBitcast(VT, Broadcast);
8769         }
8770       }
8771     }
8772   }
8773 
8774   return SDValue();
8775 }
8776 
// Combine a vector op (shuffle etc.) that is equal to build_vector load1,
// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
// are consecutive, non-overlapping, and in the right order.
8780 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
8781                                          SelectionDAG &DAG,
8782                                          const X86Subtarget &Subtarget,
8783                                          bool IsAfterLegalize) {
8784   SmallVector<SDValue, 64> Elts;
8785   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
8786     if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
8787       Elts.push_back(Elt);
8788       continue;
8789     }
8790     return SDValue();
8791   }
8792   assert(Elts.size() == VT.getVectorNumElements());
8793   return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
8794                                   IsAfterLegalize);
8795 }
8796 
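// Build an IR constant vector from a repeated-constant splat value by
// splitting SplatValue into SplatBitSize / ScalarSize elements of VT's scalar
// type. Used to materialize the repeated pattern from the constant pool for a
// subvector broadcast.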
8797 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
8798                                    unsigned SplatBitSize, LLVMContext &C) {
8799   unsigned ScalarSize = VT.getScalarSizeInBits();
8800   unsigned NumElm = SplatBitSize / ScalarSize;
8801 
8802   SmallVector<Constant *, 32> ConstantVec;
8803   for (unsigned i = 0; i < NumElm; i++) {
8804     APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
8805     Constant *Const;
8806     if (VT.isFloatingPoint()) {
8807       if (ScalarSize == 32) {
8808         Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8809       } else {
8810         assert(ScalarSize == 64 && "Unsupported floating point scalar size");
8811         Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8812       }
8813     } else
8814       Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
8815     ConstantVec.push_back(Const);
8816   }
8817   return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8818 }
8819 
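// Returns true if \p N is used (looking through bitcasts) by a target shuffle
// or by a single-use node, i.e. cases where it is better to keep the constant
// available for folding than to broadcast it. Uses as a VPERMV/VPERMV3 index
// operand can never be folded and immediately return false.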
8820 static bool isFoldableUseOfShuffle(SDNode *N) {
8821   for (auto *U : N->uses()) {
8822     unsigned Opc = U->getOpcode();
8823     // VPERMV/VPERMV3 shuffles can never fold their index operands.
8824     if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
8825       return false;
8826     if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
8827       return false;
8828     if (isTargetShuffle(Opc))
8829       return true;
8830     if (Opc == ISD::BITCAST) // Ignore bitcasts
8831       return isFoldableUseOfShuffle(U);
8832     if (N->hasOneUse())
8833       return true;
8834   }
8835   return false;
8836 }
8837 
8838 /// Attempt to use the vbroadcast instruction to generate a splat value
8839 /// from a splat BUILD_VECTOR which uses:
8840 ///  a. A single scalar load, or a constant.
8841 ///  b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
8842 ///
8843 /// The VBROADCAST node is returned when a pattern is found,
8844 /// or SDValue() otherwise.
8845 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
8846                                            const X86Subtarget &Subtarget,
8847                                            SelectionDAG &DAG) {
8848   // VBROADCAST requires AVX.
8849   // TODO: Splats could be generated for non-AVX CPUs using SSE
8850   // instructions, but there's less potential gain for only 128-bit vectors.
8851   if (!Subtarget.hasAVX())
8852     return SDValue();
8853 
8854   MVT VT = BVOp->getSimpleValueType(0);
8855   unsigned NumElts = VT.getVectorNumElements();
8856   SDLoc dl(BVOp);
8857 
8858   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
8859          "Unsupported vector type for broadcast.");
8860 
8861   // See if the build vector is a repeating sequence of scalars (inc. splat).
8862   SDValue Ld;
8863   BitVector UndefElements;
8864   SmallVector<SDValue, 16> Sequence;
8865   if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
8866     assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
8867     if (Sequence.size() == 1)
8868       Ld = Sequence[0];
8869   }
8870 
8871   // Attempt to use VBROADCASTM
8872   // From this pattern:
8873   // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
8874   // b. t1 = (build_vector t0 t0)
8875   //
8876   // Create (VBROADCASTM v2i1 X)
8877   if (!Sequence.empty() && Subtarget.hasCDI()) {
8878     // If not a splat, are the upper sequence values zeroable?
8879     unsigned SeqLen = Sequence.size();
8880     bool UpperZeroOrUndef =
8881         SeqLen == 1 ||
8882         llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
8883           return !V || V.isUndef() || isNullConstant(V);
8884         });
8885     SDValue Op0 = Sequence[0];
8886     if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
8887                              (Op0.getOpcode() == ISD::ZERO_EXTEND &&
8888                               Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
8889       SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
8890                              ? Op0.getOperand(0)
8891                              : Op0.getOperand(0).getOperand(0);
8892       MVT MaskVT = BOperand.getSimpleValueType();
8893       MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
8894       if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) ||  // for broadcastmb2q
8895           (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
8896         MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
8897         if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
8898           unsigned Scale = 512 / VT.getSizeInBits();
8899           BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
8900         }
8901         SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
8902         if (BcstVT.getSizeInBits() != VT.getSizeInBits())
8903           Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
8904         return DAG.getBitcast(VT, Bcst);
8905       }
8906     }
8907   }
8908 
8909   unsigned NumUndefElts = UndefElements.count();
8910   if (!Ld || (NumElts - NumUndefElts) <= 1) {
8911     APInt SplatValue, Undef;
8912     unsigned SplatBitSize;
8913     bool HasUndef;
8914     // Check if this is a repeated constant pattern suitable for broadcasting.
8915     if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
8916         SplatBitSize > VT.getScalarSizeInBits() &&
8917         SplatBitSize < VT.getSizeInBits()) {
8918       // Avoid replacing with broadcast when it's a use of a shuffle
8919       // instruction to preserve the present custom lowering of shuffles.
8920       if (isFoldableUseOfShuffle(BVOp))
8921         return SDValue();
      // Replace BUILD_VECTOR with a broadcast of the repeated constants.
8923       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8924       LLVMContext *Ctx = DAG.getContext();
8925       MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
8926       if (Subtarget.hasAVX()) {
8927         if (SplatBitSize == 32 || SplatBitSize == 64 ||
8928             (SplatBitSize < 32 && Subtarget.hasAVX2())) {
8929           // Splatted value can fit in one INTEGER constant in constant pool.
8930           // Load the constant and broadcast it.
8931           MVT CVT = MVT::getIntegerVT(SplatBitSize);
8932           Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
8933           Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
8934           SDValue CP = DAG.getConstantPool(C, PVT);
8935           unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
8936 
8937           Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
8938           SDVTList Tys =
8939               DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
8940           SDValue Ops[] = {DAG.getEntryNode(), CP};
8941           MachinePointerInfo MPI =
8942               MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8943           SDValue Brdcst = DAG.getMemIntrinsicNode(
8944               X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
8945               MachineMemOperand::MOLoad);
8946           return DAG.getBitcast(VT, Brdcst);
8947         }
8948         if (SplatBitSize > 64) {
8949           // Load the vector of constants and broadcast it.
8950           Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
8951                                              *Ctx);
8952           SDValue VCP = DAG.getConstantPool(VecC, PVT);
8953           unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
8954           MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
8955           Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
8956           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8957           SDValue Ops[] = {DAG.getEntryNode(), VCP};
8958           MachinePointerInfo MPI =
8959               MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8960           return DAG.getMemIntrinsicNode(
8961               X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
8962               MachineMemOperand::MOLoad);
8963         }
8964       }
8965     }
8966 
8967     // If we are moving a scalar into a vector (Ld must be set and all elements
8968     // but 1 are undef) and that operation is not obviously supported by
8969     // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
8970     // That's better than general shuffling and may eliminate a load to GPR and
8971     // move from scalar to vector register.
8972     if (!Ld || NumElts - NumUndefElts != 1)
8973       return SDValue();
8974     unsigned ScalarSize = Ld.getValueSizeInBits();
8975     if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
8976       return SDValue();
8977   }
8978 
8979   bool ConstSplatVal =
8980       (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
8981   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
8982 
8983   // TODO: Handle broadcasts of non-constant sequences.
8984 
8985   // Make sure that all of the users of a non-constant load are from the
8986   // BUILD_VECTOR node.
8987   // FIXME: Is the use count needed for non-constant, non-load case?
8988   if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
8989     return SDValue();
8990 
8991   unsigned ScalarSize = Ld.getValueSizeInBits();
8992   bool IsGE256 = (VT.getSizeInBits() >= 256);
8993 
8994   // When optimizing for size, generate up to 5 extra bytes for a broadcast
8995   // instruction to save 8 or more bytes of constant pool data.
8996   // TODO: If multiple splats are generated to load the same constant,
8997   // it may be detrimental to overall size. There needs to be a way to detect
8998   // that condition to know if this is truly a size win.
8999   bool OptForSize = DAG.shouldOptForSize();
9000 
9001   // Handle broadcasting a single constant scalar from the constant pool
9002   // into a vector.
9003   // On Sandybridge (no AVX2), it is still better to load a constant vector
9004   // from the constant pool and not to broadcast it from a scalar.
9005   // But override that restriction when optimizing for size.
9006   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9007   if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9008     EVT CVT = Ld.getValueType();
9009     assert(!CVT.isVector() && "Must not broadcast a vector type");
9010 
    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
    // When optimizing for size, also splat v2f64 and v2i64; if AVX2 is also
    // available, additionally splat i8 and i16.
9014     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9015     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9016         (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9017       const Constant *C = nullptr;
9018       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9019         C = CI->getConstantIntValue();
9020       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9021         C = CF->getConstantFPValue();
9022 
9023       assert(C && "Invalid constant type");
9024 
9025       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9026       SDValue CP =
9027           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9028       Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9029 
9030       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9031       SDValue Ops[] = {DAG.getEntryNode(), CP};
9032       MachinePointerInfo MPI =
9033           MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9034       return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9035                                      MPI, Alignment, MachineMemOperand::MOLoad);
9036     }
9037   }
9038 
9039   // Handle AVX2 in-register broadcasts.
9040   if (!IsLoad && Subtarget.hasInt256() &&
9041       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9042     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9043 
9044   // The scalar source must be a normal load.
9045   if (!IsLoad)
9046     return SDValue();
9047 
9048   // Make sure the non-chain result is only used by this build vector.
9049   if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9050     return SDValue();
9051 
9052   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9053       (Subtarget.hasVLX() && ScalarSize == 64)) {
9054     auto *LN = cast<LoadSDNode>(Ld);
9055     SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9056     SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9057     SDValue BCast =
9058         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9059                                 LN->getMemoryVT(), LN->getMemOperand());
9060     DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9061     return BCast;
9062   }
9063 
  // The integer check is needed for the 64-bit element into 128-bit vector
  // case so that we don't match f64, since there is no vbroadcastsd for xmm.
9066   if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9067       (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9068     auto *LN = cast<LoadSDNode>(Ld);
9069     SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9070     SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9071     SDValue BCast =
9072         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9073                                 LN->getMemoryVT(), LN->getMemOperand());
9074     DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9075     return BCast;
9076   }
9077 
9078   // Unsupported broadcast.
9079   return SDValue();
9080 }
9081 
9082 /// For an EXTRACT_VECTOR_ELT with a constant index return the real
9083 /// underlying vector and index.
9084 ///
9085 /// Modifies \p ExtractedFromVec to the real vector and returns the real
9086 /// index.
9087 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9088                                          SDValue ExtIdx) {
9089   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9090   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9091     return Idx;
9092 
9093   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9094   // lowered this:
9095   //   (extract_vector_elt (v8f32 %1), Constant<6>)
9096   // to:
9097   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
9098   //                           (extract_subvector (v8f32 %0), Constant<4>),
9099   //                           undef)
9100   //                       Constant<0>)
9101   // In this case the vector is the extract_subvector expression and the index
9102   // is 2, as specified by the shuffle.
9103   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9104   SDValue ShuffleVec = SVOp->getOperand(0);
9105   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9106   assert(ShuffleVecVT.getVectorElementType() ==
9107          ExtractedFromVec.getSimpleValueType().getVectorElementType());
9108 
9109   int ShuffleIdx = SVOp->getMaskElt(Idx);
9110   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9111     ExtractedFromVec = ShuffleVec;
9112     return ShuffleIdx;
9113   }
9114   return Idx;
9115 }
9116 
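/// Lower a BUILD_VECTOR whose elements are mostly EXTRACT_VECTOR_ELTs from at
/// most two source vectors of the same type into a vector shuffle, inserting
/// the few remaining non-extracted elements individually afterwards.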
9117 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9118   MVT VT = Op.getSimpleValueType();
9119 
9120   // Skip if insert_vec_elt is not supported.
9121   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9122   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9123     return SDValue();
9124 
9125   SDLoc DL(Op);
9126   unsigned NumElems = Op.getNumOperands();
9127 
9128   SDValue VecIn1;
9129   SDValue VecIn2;
9130   SmallVector<unsigned, 4> InsertIndices;
9131   SmallVector<int, 8> Mask(NumElems, -1);
9132 
9133   for (unsigned i = 0; i != NumElems; ++i) {
9134     unsigned Opc = Op.getOperand(i).getOpcode();
9135 
9136     if (Opc == ISD::UNDEF)
9137       continue;
9138 
9139     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if more than 1 element needs inserting.
9141       if (InsertIndices.size() > 1)
9142         return SDValue();
9143 
9144       InsertIndices.push_back(i);
9145       continue;
9146     }
9147 
9148     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
9149     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
9150 
9151     // Quit if non-constant index.
9152     if (!isa<ConstantSDNode>(ExtIdx))
9153       return SDValue();
9154     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
9155 
9156     // Quit if extracted from vector of different type.
9157     if (ExtractedFromVec.getValueType() != VT)
9158       return SDValue();
9159 
9160     if (!VecIn1.getNode())
9161       VecIn1 = ExtractedFromVec;
9162     else if (VecIn1 != ExtractedFromVec) {
9163       if (!VecIn2.getNode())
9164         VecIn2 = ExtractedFromVec;
9165       else if (VecIn2 != ExtractedFromVec)
        // Quit if there are more than 2 vectors to shuffle.
9167         return SDValue();
9168     }
9169 
9170     if (ExtractedFromVec == VecIn1)
9171       Mask[i] = Idx;
9172     else if (ExtractedFromVec == VecIn2)
9173       Mask[i] = Idx + NumElems;
9174   }
9175 
9176   if (!VecIn1.getNode())
9177     return SDValue();
9178 
9179   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
9180   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
9181 
9182   for (unsigned Idx : InsertIndices)
9183     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
9184                      DAG.getIntPtrConstant(Idx, DL));
9185 
9186   return NV;
9187 }
9188 
9189 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
9190 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
9191                                      const X86Subtarget &Subtarget) {
9192 
9193   MVT VT = Op.getSimpleValueType();
9194   assert((VT.getVectorElementType() == MVT::i1) &&
9195          "Unexpected type in LowerBUILD_VECTORvXi1!");
9196 
9197   SDLoc dl(Op);
9198   if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
9199       ISD::isBuildVectorAllOnes(Op.getNode()))
9200     return Op;
9201 
9202   uint64_t Immediate = 0;
9203   SmallVector<unsigned, 16> NonConstIdx;
9204   bool IsSplat = true;
9205   bool HasConstElts = false;
9206   int SplatIdx = -1;
9207   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
9208     SDValue In = Op.getOperand(idx);
9209     if (In.isUndef())
9210       continue;
9211     if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
9212       Immediate |= (InC->getZExtValue() & 0x1) << idx;
9213       HasConstElts = true;
9214     } else {
9215       NonConstIdx.push_back(idx);
9216     }
9217     if (SplatIdx < 0)
9218       SplatIdx = idx;
9219     else if (In != Op.getOperand(SplatIdx))
9220       IsSplat = false;
9221   }
9222 
  // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
9224   if (IsSplat) {
9225     // The build_vector allows the scalar element to be larger than the vector
9226     // element type. We need to mask it to use as a condition unless we know
9227     // the upper bits are zero.
9228     // FIXME: Use computeKnownBits instead of checking specific opcode?
9229     SDValue Cond = Op.getOperand(SplatIdx);
9230     assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
9231     if (Cond.getOpcode() != ISD::SETCC)
9232       Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
9233                          DAG.getConstant(1, dl, MVT::i8));
9234 
9235     // Perform the select in the scalar domain so we can use cmov.
9236     if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9237       SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
9238                                      DAG.getAllOnesConstant(dl, MVT::i32),
9239                                      DAG.getConstant(0, dl, MVT::i32));
9240       Select = DAG.getBitcast(MVT::v32i1, Select);
9241       return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
9242     } else {
9243       MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9244       SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
9245                                      DAG.getAllOnesConstant(dl, ImmVT),
9246                                      DAG.getConstant(0, dl, ImmVT));
9247       MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9248       Select = DAG.getBitcast(VecVT, Select);
9249       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
9250                          DAG.getIntPtrConstant(0, dl));
9251     }
9252   }
9253 
  // Insert the remaining non-constant elements one by one.
9255   SDValue DstVec;
9256   if (HasConstElts) {
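    // Pack the constant bits into a scalar immediate and bitcast it to the
    // mask vector type; on 32-bit targets a v64i1 immediate is built from two
    // 32-bit halves.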
9257     if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9258       SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
9259       SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
9260       ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
9261       ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
9262       DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
9263     } else {
9264       MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9265       SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
9266       MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9267       DstVec = DAG.getBitcast(VecVT, Imm);
9268       DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
9269                            DAG.getIntPtrConstant(0, dl));
9270     }
9271   } else
9272     DstVec = DAG.getUNDEF(VT);
9273 
9274   for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
9275     unsigned InsertIdx = NonConstIdx[i];
9276     DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
9277                          Op.getOperand(InsertIdx),
9278                          DAG.getIntPtrConstant(InsertIdx, dl));
9279   }
9280   return DstVec;
9281 }
9282 
9283 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
9284   switch (Opcode) {
9285   case X86ISD::PACKSS:
9286   case X86ISD::PACKUS:
9287   case X86ISD::FHADD:
9288   case X86ISD::FHSUB:
9289   case X86ISD::HADD:
9290   case X86ISD::HSUB:
9291     return true;
9292   }
9293   return false;
9294 }
9295 
9296 /// This is a helper function of LowerToHorizontalOp().
/// This function checks whether the input build_vector \p N implements a
/// 128-bit partial horizontal operation on a 256-bit vector, but that
/// operation may not match the layout of an x86 256-bit horizontal
/// instruction.
9300 /// In other words, if this returns true, then some extraction/insertion will
9301 /// be required to produce a valid horizontal instruction.
9302 ///
9303 /// Parameter \p Opcode defines the kind of horizontal operation to match.
9304 /// For example, if \p Opcode is equal to ISD::ADD, then this function
9305 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
9306 /// is equal to ISD::SUB, then this function checks if this is a horizontal
9307 /// arithmetic sub.
9308 ///
9309 /// This function only analyzes elements of \p N whose indices are
9310 /// in range [BaseIdx, LastIdx).
9311 ///
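/// For example, with \p Opcode == ISD::ADD and the range [0, 4) on a v8i32
/// build vector, this matches elements of the form
///   V0[0]+V0[1], V0[2]+V0[3], V1[0]+V1[1], V1[2]+V1[3]
/// where each addend is an EXTRACT_VECTOR_ELT of \p V0 or \p V1.
///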
9312 /// TODO: This function was originally used to match both real and fake partial
9313 /// horizontal operations, but the index-matching logic is incorrect for that.
9314 /// See the corrected implementation in isHopBuildVector(). Can we reduce this
9315 /// code because it is only used for partial h-op matching now?
9316 static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
9317                                   SelectionDAG &DAG,
9318                                   unsigned BaseIdx, unsigned LastIdx,
9319                                   SDValue &V0, SDValue &V1) {
9320   EVT VT = N->getValueType(0);
9321   assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
9322   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
9323   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
9324          "Invalid Vector in input!");
9325 
9326   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
9327   bool CanFold = true;
9328   unsigned ExpectedVExtractIdx = BaseIdx;
9329   unsigned NumElts = LastIdx - BaseIdx;
9330   V0 = DAG.getUNDEF(VT);
9331   V1 = DAG.getUNDEF(VT);
9332 
9333   // Check if N implements a horizontal binop.
9334   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
9335     SDValue Op = N->getOperand(i + BaseIdx);
9336 
9337     // Skip UNDEFs.
9338     if (Op->isUndef()) {
9339       // Update the expected vector extract index.
9340       if (i * 2 == NumElts)
9341         ExpectedVExtractIdx = BaseIdx;
9342       ExpectedVExtractIdx += 2;
9343       continue;
9344     }
9345 
9346     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
9347 
9348     if (!CanFold)
9349       break;
9350 
9351     SDValue Op0 = Op.getOperand(0);
9352     SDValue Op1 = Op.getOperand(1);
9353 
9354     // Try to match the following pattern:
9355     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
9356     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9357         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9358         Op0.getOperand(0) == Op1.getOperand(0) &&
9359         isa<ConstantSDNode>(Op0.getOperand(1)) &&
9360         isa<ConstantSDNode>(Op1.getOperand(1)));
9361     if (!CanFold)
9362       break;
9363 
9364     unsigned I0 = Op0.getConstantOperandVal(1);
9365     unsigned I1 = Op1.getConstantOperandVal(1);
9366 
9367     if (i * 2 < NumElts) {
9368       if (V0.isUndef()) {
9369         V0 = Op0.getOperand(0);
9370         if (V0.getValueType() != VT)
9371           return false;
9372       }
9373     } else {
9374       if (V1.isUndef()) {
9375         V1 = Op0.getOperand(0);
9376         if (V1.getValueType() != VT)
9377           return false;
9378       }
9379       if (i * 2 == NumElts)
9380         ExpectedVExtractIdx = BaseIdx;
9381     }
9382 
9383     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
9384     if (I0 == ExpectedVExtractIdx)
9385       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
9386     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
9387       // Try to match the following dag sequence:
9388       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
9389       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
9390     } else
9391       CanFold = false;
9392 
9393     ExpectedVExtractIdx += 2;
9394   }
9395 
9396   return CanFold;
9397 }
9398 
9399 /// Emit a sequence of two 128-bit horizontal add/sub followed by
9400 /// a concat_vector.
9401 ///
9402 /// This is a helper function of LowerToHorizontalOp().
9403 /// This function expects two 256-bit vectors called V0 and V1.
9404 /// At first, each vector is split into two separate 128-bit vectors.
9405 /// Then, the resulting 128-bit vectors are used to implement two
9406 /// horizontal binary operations.
9407 ///
9408 /// The kind of horizontal binary operation is defined by \p X86Opcode.
9409 ///
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as inputs
/// to the two new horizontal binops.
/// When Mode is set, the first horizontal binop dag node takes as input the
/// lower 128 bits of V0 and the upper 128 bits of V0. The second horizontal
/// binop dag node takes as input the lower 128 bits of V1 and the upper
/// 128 bits of V1.
9416 ///   Example:
9417 ///     HADD V0_LO, V0_HI
9418 ///     HADD V1_LO, V1_HI
9419 ///
/// Otherwise, the first horizontal binop dag node takes as input the lower
/// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
/// binop dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
9423 ///   Example:
9424 ///     HADD V0_LO, V1_LO
9425 ///     HADD V0_HI, V1_HI
9426 ///
9427 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
9428 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
9429 /// the upper 128-bits of the result.
9430 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
9431                                      const SDLoc &DL, SelectionDAG &DAG,
9432                                      unsigned X86Opcode, bool Mode,
9433                                      bool isUndefLO, bool isUndefHI) {
9434   MVT VT = V0.getSimpleValueType();
9435   assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
9436          "Invalid nodes in input!");
9437 
9438   unsigned NumElts = VT.getVectorNumElements();
9439   SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
9440   SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
9441   SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
9442   SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
9443   MVT NewVT = V0_LO.getSimpleValueType();
9444 
9445   SDValue LO = DAG.getUNDEF(NewVT);
9446   SDValue HI = DAG.getUNDEF(NewVT);
9447 
9448   if (Mode) {
9449     // Don't emit a horizontal binop if the result is expected to be UNDEF.
9450     if (!isUndefLO && !V0->isUndef())
9451       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
9452     if (!isUndefHI && !V1->isUndef())
9453       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
9454   } else {
9455     // Don't emit a horizontal binop if the result is expected to be UNDEF.
9456     if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
9457       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
9458 
9459     if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
9460       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
9461   }
9462 
9463   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
9464 }
9465 
/// Returns true iff \p BV builds a vector with the result equivalent to
/// the result of an ADDSUB/SUBADD operation.
/// If true is returned, the operands of the ADDSUB = Opnd0 +- Opnd1
/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
/// \p Opnd0 and \p Opnd1.
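///
/// For example, a v4f32 build vector with elements
///   <(fsub A[0],B[0]), (fadd A[1],B[1]), (fsub A[2],B[2]), (fadd A[3],B[3])>
/// is recognized as ADDSUB(A, B) with IsSubAdd = false.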
9471 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
9472                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
9473                              SDValue &Opnd0, SDValue &Opnd1,
9474                              unsigned &NumExtracts,
9475                              bool &IsSubAdd) {
9476 
9477   MVT VT = BV->getSimpleValueType(0);
9478   if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
9479     return false;
9480 
9481   unsigned NumElts = VT.getVectorNumElements();
9482   SDValue InVec0 = DAG.getUNDEF(VT);
9483   SDValue InVec1 = DAG.getUNDEF(VT);
9484 
9485   NumExtracts = 0;
9486 
9487   // Odd-numbered elements in the input build vector are obtained from
9488   // adding/subtracting two integer/float elements.
9489   // Even-numbered elements in the input build vector are obtained from
9490   // subtracting/adding two integer/float elements.
9491   unsigned Opc[2] = {0, 0};
9492   for (unsigned i = 0, e = NumElts; i != e; ++i) {
9493     SDValue Op = BV->getOperand(i);
9494 
9495     // Skip 'undef' values.
9496     unsigned Opcode = Op.getOpcode();
9497     if (Opcode == ISD::UNDEF)
9498       continue;
9499 
9500     // Early exit if we found an unexpected opcode.
9501     if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
9502       return false;
9503 
9504     SDValue Op0 = Op.getOperand(0);
9505     SDValue Op1 = Op.getOperand(1);
9506 
9507     // Try to match the following pattern:
9508     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
9509     // Early exit if we cannot match that sequence.
9510     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9511         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9512         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9513         Op0.getOperand(1) != Op1.getOperand(1))
9514       return false;
9515 
9516     unsigned I0 = Op0.getConstantOperandVal(1);
9517     if (I0 != i)
9518       return false;
9519 
    // We found a valid add/sub node; make sure it's the same opcode as the
    // previous elements for this parity.
9522     if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
9523       return false;
9524     Opc[i % 2] = Opcode;
9525 
9526     // Update InVec0 and InVec1.
9527     if (InVec0.isUndef()) {
9528       InVec0 = Op0.getOperand(0);
9529       if (InVec0.getSimpleValueType() != VT)
9530         return false;
9531     }
9532     if (InVec1.isUndef()) {
9533       InVec1 = Op1.getOperand(0);
9534       if (InVec1.getSimpleValueType() != VT)
9535         return false;
9536     }
9537 
    // Make sure that the operands of each add/sub node always come from the
    // same pair of vectors.
9540     if (InVec0 != Op0.getOperand(0)) {
9541       if (Opcode == ISD::FSUB)
9542         return false;
9543 
9544       // FADD is commutable. Try to commute the operands
9545       // and then test again.
9546       std::swap(Op0, Op1);
9547       if (InVec0 != Op0.getOperand(0))
9548         return false;
9549     }
9550 
9551     if (InVec1 != Op1.getOperand(0))
9552       return false;
9553 
9554     // Increment the number of extractions done.
9555     ++NumExtracts;
9556   }
9557 
9558   // Ensure we have found an opcode for both parities and that they are
9559   // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
9560   // inputs are undef.
9561   if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
9562       InVec0.isUndef() || InVec1.isUndef())
9563     return false;
9564 
9565   IsSubAdd = Opc[0] == ISD::FADD;
9566 
9567   Opnd0 = InVec0;
9568   Opnd1 = InVec1;
9569   return true;
9570 }
9571 
/// Returns true if it is possible to fold a MUL and an idiom that has already
/// been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
/// operands of FMADDSUB/FMSUBADD are written to the parameters \p Opnd0,
/// \p Opnd1 and \p Opnd2.
9576 ///
9577 /// Prior to calling this function it should be known that there is some
9578 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
9579 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
9580 /// before replacement of such SDNode with ADDSUB operation. Thus the number
9581 /// of \p Opnd0 uses is expected to be equal to 2.
9582 /// For example, this function may be called for the following IR:
9583 ///    %AB = fmul fast <2 x double> %A, %B
9584 ///    %Sub = fsub fast <2 x double> %AB, %C
9585 ///    %Add = fadd fast <2 x double> %AB, %C
9586 ///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
9587 ///                            <2 x i32> <i32 0, i32 3>
9588 /// There is a def for %Addsub here, which potentially can be replaced by
9589 /// X86ISD::ADDSUB operation:
9590 ///    %Addsub = X86ISD::ADDSUB %AB, %C
9591 /// and such ADDSUB can further be replaced with FMADDSUB:
9592 ///    %Addsub = FMADDSUB %A, %B, %C.
9593 ///
9594 /// The main reason why this method is called before the replacement of the
9595 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
9596 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
9597 /// FMADDSUB is.
9598 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
9599                                  SelectionDAG &DAG,
9600                                  SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
9601                                  unsigned ExpectedUses) {
9602   if (Opnd0.getOpcode() != ISD::FMUL ||
9603       !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
9604     return false;
9605 
9606   // FIXME: These checks must match the similar ones in
9607   // DAGCombiner::visitFADDForFMACombine. It would be good to have one
9608   // function that would answer if it is Ok to fuse MUL + ADD to FMADD
9609   // or MUL + ADDSUB to FMADDSUB.
9610   const TargetOptions &Options = DAG.getTarget().Options;
9611   bool AllowFusion =
9612       (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
9613   if (!AllowFusion)
9614     return false;
9615 
9616   Opnd2 = Opnd1;
9617   Opnd1 = Opnd0.getOperand(1);
9618   Opnd0 = Opnd0.getOperand(0);
9619 
9620   return true;
9621 }
9622 
/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
/// 'fmsubadd' operation into the corresponding X86ISD::ADDSUB,
/// X86ISD::FMADDSUB or X86ISD::FMSUBADD node.
9626 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
9627                                        const X86Subtarget &Subtarget,
9628                                        SelectionDAG &DAG) {
9629   SDValue Opnd0, Opnd1;
9630   unsigned NumExtracts;
9631   bool IsSubAdd;
9632   if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
9633                         IsSubAdd))
9634     return SDValue();
9635 
9636   MVT VT = BV->getSimpleValueType(0);
9637   SDLoc DL(BV);
9638 
9639   // Try to generate X86ISD::FMADDSUB node here.
9640   SDValue Opnd2;
9641   if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
9642     unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
9643     return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
9644   }
9645 
9646   // We only support ADDSUB.
9647   if (IsSubAdd)
9648     return SDValue();
9649 
9650   // Do not generate X86ISD::ADDSUB node for 512-bit types even though
9651   // the ADDSUB idiom has been successfully recognized. There are no known
9652   // X86 targets with 512-bit ADDSUB instructions!
9653   // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
9654   // recognition.
9655   if (VT.is512BitVector())
9656     return SDValue();
9657 
9658   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
9659 }
9660 
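/// Match a build vector where every non-undef element is the same binary op
/// (add/sub/fadd/fsub) of two adjacent elements extracted from at most two
/// source vectors. On success the X86 horizontal opcode is returned in
/// \p HOpcode and the source vectors in \p V0 and \p V1.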
9661 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
9662                              unsigned &HOpcode, SDValue &V0, SDValue &V1) {
9663   // Initialize outputs to known values.
9664   MVT VT = BV->getSimpleValueType(0);
9665   HOpcode = ISD::DELETED_NODE;
9666   V0 = DAG.getUNDEF(VT);
9667   V1 = DAG.getUNDEF(VT);
9668 
9669   // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
9670   // half of the result is calculated independently from the 128-bit halves of
9671   // the inputs, so that makes the index-checking logic below more complicated.
9672   unsigned NumElts = VT.getVectorNumElements();
9673   unsigned GenericOpcode = ISD::DELETED_NODE;
9674   unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
9675   unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
9676   unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
9677   for (unsigned i = 0; i != Num128BitChunks; ++i) {
9678     for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
9679       // Ignore undef elements.
9680       SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
9681       if (Op.isUndef())
9682         continue;
9683 
9684       // If there's an opcode mismatch, we're done.
9685       if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
9686         return false;
9687 
9688       // Initialize horizontal opcode.
9689       if (HOpcode == ISD::DELETED_NODE) {
9690         GenericOpcode = Op.getOpcode();
9691         switch (GenericOpcode) {
9692         case ISD::ADD: HOpcode = X86ISD::HADD; break;
9693         case ISD::SUB: HOpcode = X86ISD::HSUB; break;
9694         case ISD::FADD: HOpcode = X86ISD::FHADD; break;
9695         case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
9696         default: return false;
9697         }
9698       }
9699 
9700       SDValue Op0 = Op.getOperand(0);
9701       SDValue Op1 = Op.getOperand(1);
9702       if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9703           Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9704           Op0.getOperand(0) != Op1.getOperand(0) ||
9705           !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9706           !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
9707         return false;
9708 
9709       // The source vector is chosen based on which 64-bit half of the
9710       // destination vector is being calculated.
9711       if (j < NumEltsIn64Bits) {
9712         if (V0.isUndef())
9713           V0 = Op0.getOperand(0);
9714       } else {
9715         if (V1.isUndef())
9716           V1 = Op0.getOperand(0);
9717       }
9718 
9719       SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
9720       if (SourceVec != Op0.getOperand(0))
9721         return false;
9722 
9723       // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
9724       unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
9725       unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
9726       unsigned ExpectedIndex = i * NumEltsIn128Bits +
9727                                (j % NumEltsIn64Bits) * 2;
9728       if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
9729         continue;
9730 
9731       // If this is not a commutative op, this does not match.
9732       if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
9733         return false;
9734 
9735       // Addition is commutative, so try swapping the extract indexes.
9736       // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
9737       if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
9738         continue;
9739 
9740       // Extract indexes do not match horizontal requirement.
9741       return false;
9742     }
9743   }
9744   // We matched. Opcode and operands are returned by reference as arguments.
9745   return true;
9746 }
9747 
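/// Create the horizontal op for a matched build vector, first widening or
/// narrowing \p V0 and \p V1 to the build vector's type, and shrinking to a
/// 128-bit hop when only the lower half of a 256-bit result is demanded.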
9748 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
9749                                     SelectionDAG &DAG, unsigned HOpcode,
9750                                     SDValue V0, SDValue V1) {
9751   // If either input vector is not the same size as the build vector,
9752   // extract/insert the low bits to the correct size.
9753   // This is free (examples: zmm --> xmm, xmm --> ymm).
9754   MVT VT = BV->getSimpleValueType(0);
9755   unsigned Width = VT.getSizeInBits();
9756   if (V0.getValueSizeInBits() > Width)
9757     V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
9758   else if (V0.getValueSizeInBits() < Width)
9759     V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
9760 
9761   if (V1.getValueSizeInBits() > Width)
9762     V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
9763   else if (V1.getValueSizeInBits() < Width)
9764     V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
9765 
9766   unsigned NumElts = VT.getVectorNumElements();
9767   APInt DemandedElts = APInt::getAllOnesValue(NumElts);
9768   for (unsigned i = 0; i != NumElts; ++i)
9769     if (BV->getOperand(i).isUndef())
9770       DemandedElts.clearBit(i);
9771 
  // If we don't need the upper xmm, then perform as an xmm hop.
9773   unsigned HalfNumElts = NumElts / 2;
9774   if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
9775     MVT HalfVT = VT.getHalfNumVectorElementsVT();
9776     V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
9777     V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
9778     SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
9779     return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
9780   }
9781 
9782   return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
9783 }
9784 
9785 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
9786 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
9787                                    const X86Subtarget &Subtarget,
9788                                    SelectionDAG &DAG) {
9789   // We need at least 2 non-undef elements to make this worthwhile by default.
9790   unsigned NumNonUndefs =
9791       count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
9792   if (NumNonUndefs < 2)
9793     return SDValue();
9794 
9795   // There are 4 sets of horizontal math operations distinguished by type:
9796   // int/FP at 128-bit/256-bit. Each type was introduced with a different
9797   // subtarget feature. Try to match those "native" patterns first.
9798   MVT VT = BV->getSimpleValueType(0);
9799   if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
9800       ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
9801       ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
9802       ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
9803     unsigned HOpcode;
9804     SDValue V0, V1;
9805     if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
9806       return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
9807   }
9808 
9809   // Try harder to match 256-bit ops by using extract/concat.
9810   if (!Subtarget.hasAVX() || !VT.is256BitVector())
9811     return SDValue();
9812 
  // Count the number of UNDEF operands in the input build_vector.
9814   unsigned NumElts = VT.getVectorNumElements();
9815   unsigned Half = NumElts / 2;
9816   unsigned NumUndefsLO = 0;
9817   unsigned NumUndefsHI = 0;
9818   for (unsigned i = 0, e = Half; i != e; ++i)
9819     if (BV->getOperand(i)->isUndef())
9820       NumUndefsLO++;
9821 
9822   for (unsigned i = Half, e = NumElts; i != e; ++i)
9823     if (BV->getOperand(i)->isUndef())
9824       NumUndefsHI++;
9825 
9826   SDLoc DL(BV);
9827   SDValue InVec0, InVec1;
9828   if (VT == MVT::v8i32 || VT == MVT::v16i16) {
9829     SDValue InVec2, InVec3;
9830     unsigned X86Opcode;
9831     bool CanFold = true;
9832 
9833     if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
9834         isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
9835                               InVec3) &&
9836         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9837         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9838       X86Opcode = X86ISD::HADD;
9839     else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
9840                                    InVec1) &&
9841              isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
9842                                    InVec3) &&
9843              ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9844              ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9845       X86Opcode = X86ISD::HSUB;
9846     else
9847       CanFold = false;
9848 
9849     if (CanFold) {
9850       // Do not try to expand this build_vector into a pair of horizontal
9851       // add/sub if we can emit a pair of scalar add/sub.
9852       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9853         return SDValue();
9854 
9855       // Convert this build_vector into a pair of horizontal binops followed by
9856       // a concat vector. We must adjust the outputs from the partial horizontal
9857       // matching calls above to account for undefined vector halves.
9858       SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
9859       SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
9860       assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
9861       bool isUndefLO = NumUndefsLO == Half;
9862       bool isUndefHI = NumUndefsHI == Half;
9863       return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
9864                                    isUndefHI);
9865     }
9866   }
9867 
9868   if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
9869       VT == MVT::v16i16) {
9870     unsigned X86Opcode;
9871     if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
9872       X86Opcode = X86ISD::HADD;
9873     else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
9874                                    InVec1))
9875       X86Opcode = X86ISD::HSUB;
9876     else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
9877                                    InVec1))
9878       X86Opcode = X86ISD::FHADD;
9879     else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
9880                                    InVec1))
9881       X86Opcode = X86ISD::FHSUB;
9882     else
9883       return SDValue();
9884 
9885     // Don't try to expand this build_vector into a pair of horizontal add/sub
9886     // if we can simply emit a pair of scalar add/sub.
9887     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9888       return SDValue();
9889 
9890     // Convert this build_vector into two horizontal add/sub followed by
9891     // a concat vector.
9892     bool isUndefLO = NumUndefsLO == Half;
9893     bool isUndefHI = NumUndefsHI == Half;
9894     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
9895                                  isUndefLO, isUndefHI);
9896   }
9897 
9898   return SDValue();
9899 }
9900 
9901 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
9902                           SelectionDAG &DAG);
9903 
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
/// just apply the bit operation to the vectors.
/// NOTE: It's not in our interest to start making a general-purpose vectorizer
/// from this, but enough scalar bit operations are created by the later
/// legalization + scalarization stages to need basic support.
9910 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
9911                                        const X86Subtarget &Subtarget,
9912                                        SelectionDAG &DAG) {
9913   SDLoc DL(Op);
9914   MVT VT = Op->getSimpleValueType(0);
9915   unsigned NumElems = VT.getVectorNumElements();
9916   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9917 
9918   // Check that all elements have the same opcode.
  // TODO: Should we allow UNDEFs, and if so how many?
9920   unsigned Opcode = Op->getOperand(0).getOpcode();
9921   for (unsigned i = 1; i < NumElems; ++i)
9922     if (Opcode != Op->getOperand(i).getOpcode())
9923       return SDValue();
9924 
9925   // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
9926   bool IsShift = false;
9927   switch (Opcode) {
9928   default:
9929     return SDValue();
9930   case ISD::SHL:
9931   case ISD::SRL:
9932   case ISD::SRA:
9933     IsShift = true;
9934     break;
9935   case ISD::AND:
9936   case ISD::XOR:
9937   case ISD::OR:
9938     // Don't do this if the buildvector is a splat - we'd replace one
9939     // constant with an entire vector.
9940     if (Op->getSplatValue())
9941       return SDValue();
9942     if (!TLI.isOperationLegalOrPromote(Opcode, VT))
9943       return SDValue();
9944     break;
9945   }
9946 
9947   SmallVector<SDValue, 4> LHSElts, RHSElts;
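  // Split each scalar bit-op into its variable LHS and (constant) RHS so the
  // two sides can be rebuilt as separate build_vectors below.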
9948   for (SDValue Elt : Op->ops()) {
9949     SDValue LHS = Elt.getOperand(0);
9950     SDValue RHS = Elt.getOperand(1);
9951 
9952     // We expect the canonicalized RHS operand to be the constant.
9953     if (!isa<ConstantSDNode>(RHS))
9954       return SDValue();
9955 
9956     // Extend shift amounts.
9957     if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
9958       if (!IsShift)
9959         return SDValue();
9960       RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
9961     }
9962 
9963     LHSElts.push_back(LHS);
9964     RHSElts.push_back(RHS);
9965   }
9966 
9967   // Limit to shifts by uniform immediates.
9968   // TODO: Only accept vXi8/vXi64 special cases?
9969   // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
9970   if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
9971     return SDValue();
9972 
9973   SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
9974   SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
9975   SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
9976 
9977   if (!IsShift)
9978     return Res;
9979 
9980   // Immediately lower the shift to ensure the constant build vector doesn't
9981   // get converted to a constant pool before the shift is lowered.
9982   return LowerShift(Res, Subtarget, DAG);
9983 }
9984 
9985 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
9986 /// functionality to do this, so it's all zeros, all ones, or some derivation
9987 /// that is cheap to calculate.
9988 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
9989                                          const X86Subtarget &Subtarget) {
9990   SDLoc DL(Op);
9991   MVT VT = Op.getSimpleValueType();
9992 
9993   // Vectors containing all zeros can be matched by pxor and xorps.
9994   if (ISD::isBuildVectorAllZeros(Op.getNode()))
9995     return Op;
9996 
9997   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
9998   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
9999   // vpcmpeqd on 256-bit vectors.
10000   if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10001     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10002       return Op;
10003 
10004     return getOnesVector(VT, DAG, DL);
10005   }
10006 
10007   return SDValue();
10008 }
10009 
10010 /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10011 /// from a vector of source values and a vector of extraction indices.
10012 /// The vectors might be manipulated to match the type of the permute op.
10013 static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10014                                      SDLoc &DL, SelectionDAG &DAG,
10015                                      const X86Subtarget &Subtarget) {
10016   MVT ShuffleVT = VT;
10017   EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10018   unsigned NumElts = VT.getVectorNumElements();
10019   unsigned SizeInBits = VT.getSizeInBits();
10020 
10021   // Adjust IndicesVec to match VT size.
10022   assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10023          "Illegal variable permute mask size");
10024   if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10025     // Narrow/widen the indices vector to the correct size.
10026     if (IndicesVec.getValueSizeInBits() > SizeInBits)
10027       IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10028                                     NumElts * VT.getScalarSizeInBits());
10029     else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10030       IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10031                                   SDLoc(IndicesVec), SizeInBits);
10032     // Zero-extend the index elements within the vector.
10033     if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10034       IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10035                                IndicesVT, IndicesVec);
10036   }
10037   IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10038 
  // Handle a SrcVec whose size doesn't match VT.
10040   if (SrcVec.getValueSizeInBits() != SizeInBits) {
10041     if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10042       // Handle larger SrcVec by treating it as a larger permute.
10043       unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10044       VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10045       IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10046       IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10047                                   Subtarget, DAG, SDLoc(IndicesVec));
10048       SDValue NewSrcVec =
10049           createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10050       if (NewSrcVec)
10051         return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10052       return SDValue();
10053     } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10054       // Widen smaller SrcVec to match VT.
10055       SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10056     } else
10057       return SDValue();
10058   }
10059 
10060   auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10061     assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10062     EVT SrcVT = Idx.getValueType();
10063     unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10064     uint64_t IndexScale = 0;
10065     uint64_t IndexOffset = 0;
10066 
10067     // If we're scaling a smaller permute op, then we need to repeat the
10068     // indices, scaling and offsetting them as well.
10069     // e.g. v4i32 -> v16i8 (Scale = 4)
10070     // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10071     // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10072     for (uint64_t i = 0; i != Scale; ++i) {
10073       IndexScale |= Scale << (i * NumDstBits);
10074       IndexOffset |= i << (i * NumDstBits);
10075     }
10076 
10077     Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10078                       DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10079     Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10080                       DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10081     return Idx;
10082   };
10083 
10084   unsigned Opcode = 0;
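  // Pick the permute opcode (and the shuffle type to perform it in) that the
  // available subtarget features can implement for this VT; types without a
  // usable opcode fall through and bail out below.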
10085   switch (VT.SimpleTy) {
10086   default:
10087     break;
10088   case MVT::v16i8:
10089     if (Subtarget.hasSSSE3())
10090       Opcode = X86ISD::PSHUFB;
10091     break;
10092   case MVT::v8i16:
10093     if (Subtarget.hasVLX() && Subtarget.hasBWI())
10094       Opcode = X86ISD::VPERMV;
10095     else if (Subtarget.hasSSSE3()) {
10096       Opcode = X86ISD::PSHUFB;
10097       ShuffleVT = MVT::v16i8;
10098     }
10099     break;
10100   case MVT::v4f32:
10101   case MVT::v4i32:
10102     if (Subtarget.hasAVX()) {
10103       Opcode = X86ISD::VPERMILPV;
10104       ShuffleVT = MVT::v4f32;
10105     } else if (Subtarget.hasSSSE3()) {
10106       Opcode = X86ISD::PSHUFB;
10107       ShuffleVT = MVT::v16i8;
10108     }
10109     break;
10110   case MVT::v2f64:
10111   case MVT::v2i64:
10112     if (Subtarget.hasAVX()) {
10113       // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
10114       IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10115       Opcode = X86ISD::VPERMILPV;
10116       ShuffleVT = MVT::v2f64;
10117     } else if (Subtarget.hasSSE41()) {
10118       // SSE41 can compare v2i64 - select between indices 0 and 1.
10119       return DAG.getSelectCC(
10120           DL, IndicesVec,
10121           getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
10122           DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
10123           DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
10124           ISD::CondCode::SETEQ);
10125     }
10126     break;
10127   case MVT::v32i8:
10128     if (Subtarget.hasVLX() && Subtarget.hasVBMI())
10129       Opcode = X86ISD::VPERMV;
10130     else if (Subtarget.hasXOP()) {
10131       SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
10132       SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
10133       SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
10134       SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
10135       return DAG.getNode(
10136           ISD::CONCAT_VECTORS, DL, VT,
10137           DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
10138           DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
10139     } else if (Subtarget.hasAVX()) {
10140       SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
10141       SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
10142       SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
10143       SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
10144       auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
10145                               ArrayRef<SDValue> Ops) {
10146         // Permute Lo and Hi and then select based on index range.
        // This works as PSHUFB uses bits[3:0] to permute elements and we don't
        // care about bit[7] as it's just an index vector.
10149         SDValue Idx = Ops[2];
10150         EVT VT = Idx.getValueType();
10151         return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
10152                                DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
10153                                DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
10154                                ISD::CondCode::SETGT);
10155       };
10156       SDValue Ops[] = {LoLo, HiHi, IndicesVec};
10157       return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
10158                               PSHUFBBuilder);
10159     }
10160     break;
10161   case MVT::v16i16:
10162     if (Subtarget.hasVLX() && Subtarget.hasBWI())
10163       Opcode = X86ISD::VPERMV;
10164     else if (Subtarget.hasAVX()) {
10165       // Scale to v32i8 and perform as v32i8.
10166       IndicesVec = ScaleIndices(IndicesVec, 2);
10167       return DAG.getBitcast(
10168           VT, createVariablePermute(
10169                   MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
10170                   DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
10171     }
10172     break;
10173   case MVT::v8f32:
10174   case MVT::v8i32:
10175     if (Subtarget.hasAVX2())
10176       Opcode = X86ISD::VPERMV;
10177     else if (Subtarget.hasAVX()) {
10178       SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
10179       SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10180                                           {0, 1, 2, 3, 0, 1, 2, 3});
10181       SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10182                                           {4, 5, 6, 7, 4, 5, 6, 7});
10183       if (Subtarget.hasXOP())
10184         return DAG.getBitcast(
10185             VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
10186                             IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10187       // Permute Lo and Hi and then select based on index range.
10188       // This works as VPERMILPS only uses index bits[0:1] to permute elements.
10189       SDValue Res = DAG.getSelectCC(
10190           DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
10191           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
10192           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
10193           ISD::CondCode::SETGT);
10194       return DAG.getBitcast(VT, Res);
10195     }
10196     break;
10197   case MVT::v4i64:
10198   case MVT::v4f64:
10199     if (Subtarget.hasAVX512()) {
10200       if (!Subtarget.hasVLX()) {
10201         MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
10202         SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
10203                                 SDLoc(SrcVec));
10204         IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
10205                                     DAG, SDLoc(IndicesVec));
10206         SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
10207                                             DAG, Subtarget);
10208         return extract256BitVector(Res, 0, DAG, DL);
10209       }
10210       Opcode = X86ISD::VPERMV;
10211     } else if (Subtarget.hasAVX()) {
10212       SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
10213       SDValue LoLo =
10214           DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
10215       SDValue HiHi =
10216           DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
10217       // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
10218       IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10219       if (Subtarget.hasXOP())
10220         return DAG.getBitcast(
10221             VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
10222                             IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10223       // Permute Lo and Hi and then select based on index range.
10224       // This works as VPERMILPD only uses index bit[1] to permute elements.
10225       SDValue Res = DAG.getSelectCC(
10226           DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
10227           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
10228           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
10229           ISD::CondCode::SETGT);
10230       return DAG.getBitcast(VT, Res);
10231     }
10232     break;
10233   case MVT::v64i8:
10234     if (Subtarget.hasVBMI())
10235       Opcode = X86ISD::VPERMV;
10236     break;
10237   case MVT::v32i16:
10238     if (Subtarget.hasBWI())
10239       Opcode = X86ISD::VPERMV;
10240     break;
10241   case MVT::v16f32:
10242   case MVT::v16i32:
10243   case MVT::v8f64:
10244   case MVT::v8i64:
10245     if (Subtarget.hasAVX512())
10246       Opcode = X86ISD::VPERMV;
10247     break;
10248   }
10249   if (!Opcode)
10250     return SDValue();
10251 
10252   assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
10253          (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
10254          "Illegal variable permute shuffle type");
10255 
10256   uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
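  // If the permute is performed with narrower shuffle elements (e.g. v8i16
  // lowered via a v16i8 PSHUFB), repeat and scale the indices so that every
  // sub-element is addressed.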
10257   if (Scale > 1)
10258     IndicesVec = ScaleIndices(IndicesVec, Scale);
10259 
10260   EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
10261   IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
10262 
10263   SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
10264   SDValue Res = Opcode == X86ISD::VPERMV
10265                     ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
10266                     : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
10267   return DAG.getBitcast(VT, Res);
10268 }
10269 
10270 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
10271 // reasoned to be a permutation of a vector by indices in a non-constant vector.
10272 // (build_vector (extract_elt V, (extract_elt I, 0)),
10273 //               (extract_elt V, (extract_elt I, 1)),
10274 //                    ...
10275 // ->
10276 // (vpermv I, V)
10277 //
10278 // TODO: Handle undefs
10279 // TODO: Utilize pshufb and zero mask blending to support more efficient
10280 // construction of vectors with constant-0 elements.
10281 static SDValue
10282 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
10283                                    const X86Subtarget &Subtarget) {
10284   SDValue SrcVec, IndicesVec;
10285   // Check for a match of the permute source vector and permute index elements.
10286   // This is done by checking that the i-th build_vector operand is of the form:
10287   // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
10288   for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
10289     SDValue Op = V.getOperand(Idx);
10290     if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10291       return SDValue();
10292 
10293     // If this is the first extract encountered in V, set the source vector,
10294     // otherwise verify the extract is from the previously defined source
10295     // vector.
10296     if (!SrcVec)
10297       SrcVec = Op.getOperand(0);
10298     else if (SrcVec != Op.getOperand(0))
10299       return SDValue();
10300     SDValue ExtractedIndex = Op->getOperand(1);
10301     // Peek through extends.
10302     if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
10303         ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
10304       ExtractedIndex = ExtractedIndex.getOperand(0);
10305     if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10306       return SDValue();
10307 
10308     // If this is the first extract from the index vector candidate, set the
10309     // indices vector, otherwise verify the extract is from the previously
10310     // defined indices vector.
10311     if (!IndicesVec)
10312       IndicesVec = ExtractedIndex.getOperand(0);
10313     else if (IndicesVec != ExtractedIndex.getOperand(0))
10314       return SDValue();
10315 
10316     auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
10317     if (!PermIdx || PermIdx->getAPIntValue() != Idx)
10318       return SDValue();
10319   }
10320 
10321   SDLoc DL(V);
10322   MVT VT = V.getSimpleValueType();
10323   return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10324 }
10325 
10326 SDValue
10327 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
10328   SDLoc dl(Op);
10329 
10330   MVT VT = Op.getSimpleValueType();
10331   MVT EltVT = VT.getVectorElementType();
10332   unsigned NumElems = Op.getNumOperands();
10333 
10334   // Generate vectors for predicate vectors.
10335   if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
10336     return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
10337 
10338   if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
10339     return VectorConstant;
10340 
10341   unsigned EVTBits = EltVT.getSizeInBits();
10342   APInt UndefMask = APInt::getNullValue(NumElems);
10343   APInt ZeroMask = APInt::getNullValue(NumElems);
10344   APInt NonZeroMask = APInt::getNullValue(NumElems);
10345   bool IsAllConstants = true;
10346   SmallSet<SDValue, 8> Values;
10347   unsigned NumConstants = NumElems;
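  // Classify each operand: record undef and zero elements, collect the
  // distinct values, and count how many operands are constants.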
10348   for (unsigned i = 0; i < NumElems; ++i) {
10349     SDValue Elt = Op.getOperand(i);
10350     if (Elt.isUndef()) {
10351       UndefMask.setBit(i);
10352       continue;
10353     }
10354     Values.insert(Elt);
10355     if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
10356       IsAllConstants = false;
10357       NumConstants--;
10358     }
10359     if (X86::isZeroNode(Elt)) {
10360       ZeroMask.setBit(i);
10361     } else {
10362       NonZeroMask.setBit(i);
10363     }
10364   }
10365 
10366   // All undef vector. Return an UNDEF. All zero vectors were handled above.
10367   if (NonZeroMask == 0) {
10368     assert(UndefMask.isAllOnesValue() && "Fully undef mask expected");
10369     return DAG.getUNDEF(VT);
10370   }
10371 
10372   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
10373 
10374   // If the upper elts of a ymm/zmm are undef/zero then we might be better off
10375   // lowering to a smaller build vector and padding with undef/zero.
10376   if ((VT.is256BitVector() || VT.is512BitVector()) &&
10377       !isFoldableUseOfShuffle(BV)) {
10378     unsigned UpperElems = NumElems / 2;
10379     APInt UndefOrZeroMask = UndefMask | ZeroMask;
10380     unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
10381     if (NumUpperUndefsOrZeros >= UpperElems) {
10382       if (VT.is512BitVector() &&
10383           NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
10384         UpperElems = NumElems - (NumElems / 4);
10385       bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
10386       MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
10387       SDValue NewBV =
10388           DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
10389       return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
10390     }
10391   }
10392 
10393   if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
10394     return AddSub;
10395   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
10396     return HorizontalOp;
10397   if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
10398     return Broadcast;
10399   if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
10400     return BitOp;
10401 
10402   unsigned NumZero = ZeroMask.countPopulation();
10403   unsigned NumNonZero = NonZeroMask.countPopulation();
10404 
10405   // If we are inserting one variable into a vector of non-zero constants, try
10406   // to avoid loading each constant element as a scalar. Load the constants as a
10407   // vector and then insert the variable scalar element. If insertion is not
10408   // supported, fall back to a shuffle to get the scalar blended with the
10409   // constants. Insertion into a zero vector is handled as a special-case
10410   // somewhere below here.
10411   if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
10412       (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
10413        isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
10414     // Create an all-constant vector. The variable element in the old
10415     // build vector is replaced by undef in the constant vector. Save the
10416     // variable scalar element and its index for use in the insertelement.
10417     LLVMContext &Context = *DAG.getContext();
10418     Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
10419     SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
10420     SDValue VarElt;
10421     SDValue InsIndex;
10422     for (unsigned i = 0; i != NumElems; ++i) {
10423       SDValue Elt = Op.getOperand(i);
10424       if (auto *C = dyn_cast<ConstantSDNode>(Elt))
10425         ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
10426       else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
10427         ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
10428       else if (!Elt.isUndef()) {
10429         assert(!VarElt.getNode() && !InsIndex.getNode() &&
10430                "Expected one variable element in this vector");
10431         VarElt = Elt;
10432         InsIndex = DAG.getVectorIdxConstant(i, dl);
10433       }
10434     }
10435     Constant *CV = ConstantVector::get(ConstVecOps);
10436     SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
10437 
    // The constants we just created may not be legal (e.g., floating point). We
    // must lower the vector right here because we cannot guarantee that we'll
10440     // legalize it before loading it. This is also why we could not just create
10441     // a new build vector here. If the build vector contains illegal constants,
10442     // it could get split back up into a series of insert elements.
10443     // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
10444     SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
10445     MachineFunction &MF = DAG.getMachineFunction();
10446     MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
10447     SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
10448     unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
10449     unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
10450     if (InsertC < NumEltsInLow128Bits)
10451       return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
10452 
10453     // There's no good way to insert into the high elements of a >128-bit
10454     // vector, so use shuffles to avoid an extract/insert sequence.
10455     assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
10456     assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
10457     SmallVector<int, 8> ShuffleMask;
10458     unsigned NumElts = VT.getVectorNumElements();
10459     for (unsigned i = 0; i != NumElts; ++i)
10460       ShuffleMask.push_back(i == InsertC ? NumElts : i);
10461     SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
10462     return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
10463   }
10464 
10465   // Special case for single non-zero, non-undef, element.
10466   if (NumNonZero == 1) {
10467     unsigned Idx = NonZeroMask.countTrailingZeros();
10468     SDValue Item = Op.getOperand(Idx);
10469 
10470     // If we have a constant or non-constant insertion into the low element of
10471     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
10472     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
10473     // depending on what the source datatype is.
10474     if (Idx == 0) {
10475       if (NumZero == 0)
10476         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10477 
10478       if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
10479           (EltVT == MVT::i64 && Subtarget.is64Bit())) {
10480         assert((VT.is128BitVector() || VT.is256BitVector() ||
10481                 VT.is512BitVector()) &&
10482                "Expected an SSE value type!");
10483         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10484         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
10485         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10486       }
10487 
10488       // We can't directly insert an i8 or i16 into a vector, so zero extend
10489       // it to i32 first.
10490       if (EltVT == MVT::i16 || EltVT == MVT::i8) {
10491         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
10492         MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
10493         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
10494         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10495         return DAG.getBitcast(VT, Item);
10496       }
10497     }
10498 
10499     // Is it a vector logical left shift?
10500     if (NumElems == 2 && Idx == 1 &&
10501         X86::isZeroNode(Op.getOperand(0)) &&
10502         !X86::isZeroNode(Op.getOperand(1))) {
10503       unsigned NumBits = VT.getSizeInBits();
10504       return getVShift(true, VT,
10505                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
10506                                    VT, Op.getOperand(1)),
10507                        NumBits/2, DAG, *this, dl);
10508     }
10509 
10510     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
10511       return SDValue();
10512 
10513     // Otherwise, if this is a vector with i32 or f32 elements, and the element
10514     // is a non-constant being inserted into an element other than the low one,
10515     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
10516     // movd/movss) to move this into the low element, then shuffle it into
10517     // place.
10518     if (EVTBits == 32) {
10519       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10520       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
10521     }
10522   }
10523 
10524   // Splat is obviously ok. Let legalizer expand it to a shuffle.
10525   if (Values.size() == 1) {
10526     if (EVTBits == 32) {
      // Instead of a shuffle like this:
      //   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // check if it's possible to issue this instead:
      //   shuffle (vload ptr), undef, <1, 1, 1, 1>
10531       unsigned Idx = NonZeroMask.countTrailingZeros();
10532       SDValue Item = Op.getOperand(Idx);
10533       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
10534         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
10535     }
10536     return SDValue();
10537   }
10538 
10539   // A vector full of immediates; various special cases are already
10540   // handled, so this is best done with a single constant-pool load.
10541   if (IsAllConstants)
10542     return SDValue();
10543 
10544   if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
10545       return V;
10546 
10547   // See if we can use a vector load to get all of the elements.
10548   {
10549     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
10550     if (SDValue LD =
10551             EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
10552       return LD;
10553   }
10554 
10555   // If this is a splat of pairs of 32-bit elements, we can use a narrower
10556   // build_vector and broadcast it.
10557   // TODO: We could probably generalize this more.
10558   if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
10559     SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
10560                        DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
10561     auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
10562       // Make sure all the even/odd operands match.
10563       for (unsigned i = 2; i != NumElems; ++i)
10564         if (Ops[i % 2] != Op.getOperand(i))
10565           return false;
10566       return true;
10567     };
10568     if (CanSplat(Op, NumElems, Ops)) {
10569       MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
10570       MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
10571       // Create a new build vector and cast to v2i64/v2f64.
10572       SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
10573                                      DAG.getBuildVector(NarrowVT, dl, Ops));
10574       // Broadcast from v2i64/v2f64 and cast to final VT.
10575       MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
10576       return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
10577                                             NewBV));
10578     }
10579   }
10580 
10581   // For AVX-length vectors, build the individual 128-bit pieces and use
10582   // shuffles to put them in place.
10583   if (VT.getSizeInBits() > 128) {
10584     MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
10585 
10586     // Build both the lower and upper subvector.
10587     SDValue Lower =
10588         DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
10589     SDValue Upper = DAG.getBuildVector(
10590         HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
10591 
10592     // Recreate the wider vector with the lower and upper part.
10593     return concatSubVectors(Lower, Upper, DAG, dl);
10594   }
10595 
10596   // Let legalizer expand 2-wide build_vectors.
10597   if (EVTBits == 64) {
10598     if (NumNonZero == 1) {
10599       // One half is zero or undef.
10600       unsigned Idx = NonZeroMask.countTrailingZeros();
10601       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
10602                                Op.getOperand(Idx));
10603       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
10604     }
10605     return SDValue();
10606   }
10607 
10608   // If element VT is < 32 bits, convert it to inserts into a zero vector.
10609   if (EVTBits == 8 && NumElems == 16)
10610     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
10611                                           DAG, Subtarget))
10612       return V;
10613 
10614   if (EVTBits == 16 && NumElems == 8)
10615     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
10616                                           DAG, Subtarget))
10617       return V;
10618 
  // If the element VT is 32 bits and there are 4 elements, try to generate an
  // INSERTPS.
10620   if (EVTBits == 32 && NumElems == 4)
10621     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
10622       return V;
10623 
10624   // If element VT is == 32 bits, turn it into a number of shuffles.
10625   if (NumElems == 4 && NumZero > 0) {
10626     SmallVector<SDValue, 8> Ops(NumElems);
10627     for (unsigned i = 0; i < 4; ++i) {
10628       bool isZero = !NonZeroMask[i];
10629       if (isZero)
10630         Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
10631       else
10632         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10633     }
10634 
10635     for (unsigned i = 0; i < 2; ++i) {
10636       switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
10637         default: llvm_unreachable("Unexpected NonZero count");
10638         case 0:
10639           Ops[i] = Ops[i*2];  // Must be a zero vector.
10640           break;
10641         case 1:
10642           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
10643           break;
10644         case 2:
10645           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10646           break;
10647         case 3:
10648           Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10649           break;
10650       }
10651     }
10652 
10653     bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
10654     bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
10655     int MaskVec[] = {
10656       Reverse1 ? 1 : 0,
10657       Reverse1 ? 0 : 1,
10658       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
10659       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
10660     };
10661     return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
10662   }
10663 
10664   assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
10665 
10666   // Check for a build vector from mostly shuffle plus few inserting.
10667   if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
10668     return Sh;
10669 
10670   // For SSE 4.1, use insertps to put the high elements into the low element.
10671   if (Subtarget.hasSSE41()) {
10672     SDValue Result;
10673     if (!Op.getOperand(0).isUndef())
10674       Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
10675     else
10676       Result = DAG.getUNDEF(VT);
10677 
10678     for (unsigned i = 1; i < NumElems; ++i) {
10679       if (Op.getOperand(i).isUndef()) continue;
10680       Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
10681                            Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
10682     }
10683     return Result;
10684   }
10685 
  // Otherwise, expand into a number of unpckl*. Start by extending each of
  // our (non-undef) elements to the full vector width, with the element in the
  // bottom slot of the vector (which generates no code for SSE).
10689   SmallVector<SDValue, 8> Ops(NumElems);
10690   for (unsigned i = 0; i < NumElems; ++i) {
10691     if (!Op.getOperand(i).isUndef())
10692       Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10693     else
10694       Ops[i] = DAG.getUNDEF(VT);
10695   }
10696 
10697   // Next, we iteratively mix elements, e.g. for v4f32:
10698   //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
10699   //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
10700   //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
10701   for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
10702     // Generate scaled UNPCKL shuffle mask.
10703     SmallVector<int, 16> Mask;
10704     for(unsigned i = 0; i != Scale; ++i)
10705       Mask.push_back(i);
10706     for (unsigned i = 0; i != Scale; ++i)
10707       Mask.push_back(NumElems+i);
10708     Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
10709 
10710     for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
10711       Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
10712   }
10713   return Ops[0];
10714 }
10715 
10716 // 256-bit AVX can use the vinsertf128 instruction
10717 // to create 256-bit vectors from two other 128-bit ones.
10718 // TODO: Detect subvector broadcast here instead of DAG combine?
10719 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
10720                                       const X86Subtarget &Subtarget) {
10721   SDLoc dl(Op);
10722   MVT ResVT = Op.getSimpleValueType();
10723 
10724   assert((ResVT.is256BitVector() ||
10725           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
10726 
10727   unsigned NumOperands = Op.getNumOperands();
10728   unsigned NumZero = 0;
10729   unsigned NumNonZero = 0;
10730   unsigned NonZeros = 0;
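  // Classify each subvector operand, remembering which ones are non-zero so
  // they can be inserted below.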
10731   for (unsigned i = 0; i != NumOperands; ++i) {
10732     SDValue SubVec = Op.getOperand(i);
10733     if (SubVec.isUndef())
10734       continue;
10735     if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10736       ++NumZero;
10737     else {
10738       assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10739       NonZeros |= 1 << i;
10740       ++NumNonZero;
10741     }
10742   }
10743 
10744   // If we have more than 2 non-zeros, build each half separately.
10745   if (NumNonZero > 2) {
10746     MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10747     ArrayRef<SDUse> Ops = Op->ops();
10748     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10749                              Ops.slice(0, NumOperands/2));
10750     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10751                              Ops.slice(NumOperands/2));
10752     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10753   }
10754 
10755   // Otherwise, build it up through insert_subvectors.
10756   SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
10757                         : DAG.getUNDEF(ResVT);
10758 
10759   MVT SubVT = Op.getOperand(0).getSimpleValueType();
10760   unsigned NumSubElems = SubVT.getVectorNumElements();
10761   for (unsigned i = 0; i != NumOperands; ++i) {
10762     if ((NonZeros & (1 << i)) == 0)
10763       continue;
10764 
10765     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
10766                       Op.getOperand(i),
10767                       DAG.getIntPtrConstant(i * NumSubElems, dl));
10768   }
10769 
10770   return Vec;
10771 }
10772 
// Lower a vXi1 CONCAT_VECTORS of mask (k-register) subvectors by building the
// result from a zero/undef base vector and inserting the non-zero subvectors,
// using KSHIFTL when that saves a shift over the generic insert_subvector
// lowering.
10776 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
10777 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
10778                                        const X86Subtarget &Subtarget,
10779                                        SelectionDAG & DAG) {
10780   SDLoc dl(Op);
10781   MVT ResVT = Op.getSimpleValueType();
10782   unsigned NumOperands = Op.getNumOperands();
10783 
10784   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
10785          "Unexpected number of operands in CONCAT_VECTORS");
10786 
10787   uint64_t Zeros = 0;
10788   uint64_t NonZeros = 0;
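  // Record which subvector operands are known all-zeros and which are
  // non-zero; undef operands are ignored.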
10789   for (unsigned i = 0; i != NumOperands; ++i) {
10790     SDValue SubVec = Op.getOperand(i);
10791     if (SubVec.isUndef())
10792       continue;
10793     assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10794     if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10795       Zeros |= (uint64_t)1 << i;
10796     else
10797       NonZeros |= (uint64_t)1 << i;
10798   }
10799 
10800   unsigned NumElems = ResVT.getVectorNumElements();
10801 
  // If we are inserting a single non-zero vector with zeros in the LSBs and
  // undefs in the MSBs, emit a KSHIFTL. The generic insert_subvector lowering
  // would give us two kshifts.
10805   if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
10806       Log2_64(NonZeros) != NumOperands - 1) {
10807     MVT ShiftVT = ResVT;
10808     if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
10809       ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
10810     unsigned Idx = Log2_64(NonZeros);
10811     SDValue SubVec = Op.getOperand(Idx);
10812     unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10813     SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
10814                          DAG.getUNDEF(ShiftVT), SubVec,
10815                          DAG.getIntPtrConstant(0, dl));
10816     Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
10817                      DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
10818     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
10819                        DAG.getIntPtrConstant(0, dl));
10820   }
10821 
10822   // If there are zero or one non-zeros we can handle this very simply.
10823   if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
10824     SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
10825     if (!NonZeros)
10826       return Vec;
10827     unsigned Idx = Log2_64(NonZeros);
10828     SDValue SubVec = Op.getOperand(Idx);
10829     unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10830     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
10831                        DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
10832   }
10833 
10834   if (NumOperands > 2) {
10835     MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10836     ArrayRef<SDUse> Ops = Op->ops();
10837     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10838                              Ops.slice(0, NumOperands/2));
10839     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10840                              Ops.slice(NumOperands/2));
10841     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10842   }
10843 
10844   assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
10845 
10846   if (ResVT.getVectorNumElements() >= 16)
10847     return Op; // The operation is legal with KUNPCK
10848 
10849   SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
10850                             DAG.getUNDEF(ResVT), Op.getOperand(0),
10851                             DAG.getIntPtrConstant(0, dl));
10852   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
10853                      DAG.getIntPtrConstant(NumElems/2, dl));
10854 }
10855 
10856 static SDValue LowerCONCAT_VECTORS(SDValue Op,
10857                                    const X86Subtarget &Subtarget,
10858                                    SelectionDAG &DAG) {
10859   MVT VT = Op.getSimpleValueType();
10860   if (VT.getVectorElementType() == MVT::i1)
10861     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
10862 
10863   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
10864          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
10865           Op.getNumOperands() == 4)));
10866 
10867   // AVX can use the vinsertf128 instruction to create 256-bit vectors
10868   // from two other 128-bit ones.
10869 
10870   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
10871   return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
10872 }
10873 
10874 //===----------------------------------------------------------------------===//
10875 // Vector shuffle lowering
10876 //
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns, while staying
// within a framework that allows reasonably efficient handling of all vector
// shuffle patterns.
10883 //===----------------------------------------------------------------------===//
10884 
10885 /// Tiny helper function to identify a no-op mask.
10886 ///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
/// used by the X86 shuffle instructions (not a fully general
/// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and
/// an in-place shuffle are 'no-op's.
10892 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
10893   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10894     assert(Mask[i] >= -1 && "Out of bound mask element!");
10895     if (Mask[i] >= 0 && Mask[i] != i)
10896       return false;
10897   }
10898   return true;
10899 }
10900 
10901 /// Test whether there are elements crossing LaneSizeInBits lanes in this
10902 /// shuffle mask.
10903 ///
10904 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
10905 /// and we routinely test for these.
10906 static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
10907                                       unsigned ScalarSizeInBits,
10908                                       ArrayRef<int> Mask) {
10909   assert(LaneSizeInBits && ScalarSizeInBits &&
10910          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10911          "Illegal shuffle lane size");
10912   int LaneSize = LaneSizeInBits / ScalarSizeInBits;
10913   int Size = Mask.size();
10914   for (int i = 0; i < Size; ++i)
10915     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10916       return true;
10917   return false;
10918 }
10919 
10920 /// Test whether there are elements crossing 128-bit lanes in this
10921 /// shuffle mask.
10922 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
10923   return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
10924 }
10925 
/// Test whether the elements in each LaneSizeInBits lane of this shuffle mask
/// come from multiple source lanes - this differs from
/// isLaneCrossingShuffleMask in order to better support 'repeated mask + lane
/// permute' style shuffles.
10929 static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
10930                                    unsigned ScalarSizeInBits,
10931                                    ArrayRef<int> Mask) {
10932   assert(LaneSizeInBits && ScalarSizeInBits &&
10933          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10934          "Illegal shuffle lane size");
10935   int NumElts = Mask.size();
10936   int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
10937   int NumLanes = NumElts / NumEltsPerLane;
10938   if (NumLanes > 1) {
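    // For each destination lane, require that all defined elements come from
    // a single source lane; two different source lanes feeding one destination
    // lane makes this a multi-lane mask.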
10939     for (int i = 0; i != NumLanes; ++i) {
10940       int SrcLane = -1;
10941       for (int j = 0; j != NumEltsPerLane; ++j) {
10942         int M = Mask[(i * NumEltsPerLane) + j];
10943         if (M < 0)
10944           continue;
10945         int Lane = (M % NumElts) / NumEltsPerLane;
10946         if (SrcLane >= 0 && SrcLane != Lane)
10947           return true;
10948         SrcLane = Lane;
10949       }
10950     }
10951   }
10952   return false;
10953 }
10954 
10955 /// Test whether a shuffle mask is equivalent within each sub-lane.
10956 ///
10957 /// This checks a shuffle mask to see if it is performing the same
10958 /// lane-relative shuffle in each sub-lane. This trivially implies
10959 /// that it is also not lane-crossing. It may however involve a blend from the
10960 /// same lane of a second vector.
10961 ///
10962 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
10963 /// non-trivial to compute in the face of undef lanes. The representation is
10964 /// suitable for use with existing 128-bit shuffles as entries from the second
10965 /// vector have been remapped to [LaneSize, 2*LaneSize).
10966 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
10967                                   ArrayRef<int> Mask,
10968                                   SmallVectorImpl<int> &RepeatedMask) {
10969   auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10970   RepeatedMask.assign(LaneSize, -1);
10971   int Size = Mask.size();
10972   for (int i = 0; i < Size; ++i) {
10973     assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10974     if (Mask[i] < 0)
10975       continue;
10976     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10977       // This entry crosses lanes, so there is no way to model this shuffle.
10978       return false;
10979 
10980     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10981     // Adjust second vector indices to start at LaneSize instead of Size.
10982     int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10983                                 : Mask[i] % LaneSize + LaneSize;
10984     if (RepeatedMask[i % LaneSize] < 0)
10985       // This is the first non-undef entry in this slot of a 128-bit lane.
10986       RepeatedMask[i % LaneSize] = LocalM;
10987     else if (RepeatedMask[i % LaneSize] != LocalM)
10988       // Found a mismatch with the repeated mask.
10989       return false;
10990   }
10991   return true;
10992 }
10993 
10994 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
10995 static bool
10996 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10997                                 SmallVectorImpl<int> &RepeatedMask) {
10998   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10999 }
11000 
11001 static bool
11002 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11003   SmallVector<int, 32> RepeatedMask;
11004   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11005 }
11006 
11007 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
11008 static bool
11009 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11010                                 SmallVectorImpl<int> &RepeatedMask) {
11011   return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11012 }
11013 
11014 /// Test whether a target shuffle mask is equivalent within each sub-lane.
11015 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11016 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11017                                         unsigned EltSizeInBits,
11018                                         ArrayRef<int> Mask,
11019                                         SmallVectorImpl<int> &RepeatedMask) {
11020   int LaneSize = LaneSizeInBits / EltSizeInBits;
11021   RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11022   int Size = Mask.size();
11023   for (int i = 0; i < Size; ++i) {
11024     assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11025     if (Mask[i] == SM_SentinelUndef)
11026       continue;
11027     if (Mask[i] == SM_SentinelZero) {
11028       if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11029         return false;
11030       RepeatedMask[i % LaneSize] = SM_SentinelZero;
11031       continue;
11032     }
11033     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11034       // This entry crosses lanes, so there is no way to model this shuffle.
11035       return false;
11036 
11037     // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11038     // later vector indices to start at multiples of LaneSize instead of Size.
11039     int LaneM = Mask[i] / Size;
11040     int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11041     if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11042       // This is the first non-undef entry in this slot of a 128-bit lane.
11043       RepeatedMask[i % LaneSize] = LocalM;
11044     else if (RepeatedMask[i % LaneSize] != LocalM)
11045       // Found a mismatch with the repeated mask.
11046       return false;
11047   }
11048   return true;
11049 }
11050 
11051 /// Test whether a target shuffle mask is equivalent within each sub-lane.
11052 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11053 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11054                                         ArrayRef<int> Mask,
11055                                         SmallVectorImpl<int> &RepeatedMask) {
11056   return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11057                                      Mask, RepeatedMask);
11058 }
11059 
11060 /// Checks whether the vector elements referenced by two shuffle masks are
11061 /// equivalent.
11062 static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11063                                 int Idx, int ExpectedIdx) {
11064   assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11065          ExpectedIdx < MaskSize && "Out of range element index");
11066   if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11067     return false;
11068 
11069   switch (Op.getOpcode()) {
11070   case ISD::BUILD_VECTOR:
11071     // If the values are build vectors, we can look through them to find
11072     // equivalent inputs that make the shuffles equivalent.
11073     // TODO: Handle MaskSize != Op.getNumOperands()?
11074     if (MaskSize == (int)Op.getNumOperands() &&
11075         MaskSize == (int)ExpectedOp.getNumOperands())
11076       return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
11077     break;
11078   case X86ISD::VBROADCAST:
11079   case X86ISD::VBROADCAST_LOAD:
11080     // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
11081     return (Op == ExpectedOp &&
11082             (int)Op.getValueType().getVectorNumElements() == MaskSize);
11083   case X86ISD::HADD:
11084   case X86ISD::HSUB:
11085   case X86ISD::FHADD:
11086   case X86ISD::FHSUB:
11087   case X86ISD::PACKSS:
11088   case X86ISD::PACKUS:
11089     // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
11090     // TODO: Handle MaskSize != NumElts?
11091     // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
11092     if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
11093       MVT VT = Op.getSimpleValueType();
11094       int NumElts = VT.getVectorNumElements();
11095       if (MaskSize == NumElts) {
11096         int NumLanes = VT.getSizeInBits() / 128;
11097         int NumEltsPerLane = NumElts / NumLanes;
11098         int NumHalfEltsPerLane = NumEltsPerLane / 2;
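        // HOP(X,X) repeats each half of a lane's result in both halves, so two
        // indices refer to the same value iff they are in the same 128-bit
        // lane and at the same offset within a half-lane.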
11099         bool SameLane =
11100             (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
11101         bool SameElt =
11102             (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
11103         return SameLane && SameElt;
11104       }
11105     }
11106     break;
11107   }
11108 
11109   return false;
11110 }
11111 
11112 /// Checks whether a shuffle mask is equivalent to an explicit list of
11113 /// arguments.
11114 ///
11115 /// This is a fast way to test a shuffle mask against a fixed pattern:
11116 ///
11117 ///   if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
11118 ///
11119 /// It returns true if the mask is exactly as wide as the argument list, and
11120 /// each element of the mask is either -1 (signifying undef) or the value given
11121 /// in the argument.
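///
/// Undef (-1) mask elements match anything; e.g.
/// isShuffleEquivalent({-1, 2, -1, 0}, {3, 2, 1, 0}) also returns true.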
11122 static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
11123                                 SDValue V1 = SDValue(),
11124                                 SDValue V2 = SDValue()) {
11125   int Size = Mask.size();
11126   if (Size != (int)ExpectedMask.size())
11127     return false;
11128 
11129   for (int i = 0; i < Size; ++i) {
11130     assert(Mask[i] >= -1 && "Out of bound mask element!");
11131     int MaskIdx = Mask[i];
11132     int ExpectedIdx = ExpectedMask[i];
11133     if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
11134       SDValue MaskV = MaskIdx < Size ? V1 : V2;
11135       SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11136       MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11137       ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11138       if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11139         return false;
11140     }
11141   }
11142   return true;
11143 }
11144 
11145 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
11146 ///
11147 /// The masks must be exactly the same width.
11148 ///
11149 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
11150 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
11151 ///
11152 /// SM_SentinelZero is accepted as a valid negative index but must match in
11153 /// both.
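///
/// For instance: Mask <SM_SentinelZero, 1, SM_SentinelUndef, 3> matches
/// ExpectedMask <SM_SentinelZero, 1, 2, 3>, but would fail if the expected
/// element 0 were a real index instead of a zero sentinel.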
11154 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
11155                                       ArrayRef<int> ExpectedMask,
11156                                       SDValue V1 = SDValue(),
11157                                       SDValue V2 = SDValue()) {
11158   int Size = Mask.size();
11159   if (Size != (int)ExpectedMask.size())
11160     return false;
11161   assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
11162          "Illegal target shuffle mask");
11163 
11164   // Check for out-of-range target shuffle mask indices.
11165   if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
11166     return false;
11167 
11168   // Don't use V1/V2 if they're not the same size as the shuffle mask type.
11169   if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
11170     V1 = SDValue();
11171   if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
11172     V2 = SDValue();
11173 
11174   for (int i = 0; i < Size; ++i) {
11175     int MaskIdx = Mask[i];
11176     int ExpectedIdx = ExpectedMask[i];
11177     if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
11178       continue;
11179     if (0 <= MaskIdx && 0 <= ExpectedIdx) {
11180       SDValue MaskV = MaskIdx < Size ? V1 : V2;
11181       SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11182       MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11183       ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11184       if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11185         continue;
11186     }
11187     // TODO - handle SM_Sentinel equivalences.
11188     return false;
11189   }
11190   return true;
11191 }
11192 
11193 // Attempt to create a shuffle mask from a VSELECT condition mask.
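// For example: a v4i32 VSELECT with constant condition <-1, 0, -1, 0> yields
// the shuffle mask <0, 5, 2, 7>: true lanes select from the first operand,
// false/undef lanes from the second.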
11194 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
11195                                          SDValue Cond) {
11196   EVT CondVT = Cond.getValueType();
11197   unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
11198   unsigned NumElts = CondVT.getVectorNumElements();
11199 
11200   APInt UndefElts;
11201   SmallVector<APInt, 32> EltBits;
11202   if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
11203                                      true, false))
11204     return false;
11205 
11206   Mask.resize(NumElts, SM_SentinelUndef);
11207 
11208   for (int i = 0; i != (int)NumElts; ++i) {
11209     Mask[i] = i;
11210     // Arbitrarily choose from the 2nd operand if the select condition element
11211     // is undef.
11212     // TODO: Can we do better by matching patterns such as even/odd?
11213     if (UndefElts[i] || EltBits[i].isNullValue())
11214       Mask[i] += NumElts;
11215   }
11216 
11217   return true;
11218 }
11219 
11220 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
11221 // instructions.
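// As an illustration: the matched patterns are the v8i16 unpack masks
// <0,8,1,9,2,10,3,11> (lo) and <4,12,5,13,6,14,7,15> (hi), compared against
// the given v8i32/v8f32 shuffle mask.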
11222 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
11223   if (VT != MVT::v8i32 && VT != MVT::v8f32)
11224     return false;
11225 
11226   SmallVector<int, 8> Unpcklwd;
11227   createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
11228                           /* Unary = */ false);
11229   SmallVector<int, 8> Unpckhwd;
11230   createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
11231                           /* Unary = */ false);
11232   bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
11233                          isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
11234   return IsUnpackwdMask;
11235 }
11236 
11237 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
11238   // Create 128-bit vector type based on mask size.
11239   MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
11240   MVT VT = MVT::getVectorVT(EltVT, Mask.size());
11241 
11242   // We can't assume a canonical shuffle mask, so try the commuted version too.
11243   SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11244   ShuffleVectorSDNode::commuteMask(CommutedMask);
11245 
11246   // Match any of unary/binary or low/high.
11247   for (unsigned i = 0; i != 4; ++i) {
11248     SmallVector<int, 16> UnpackMask;
11249     createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
11250     if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
11251         isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
11252       return true;
11253   }
11254   return false;
11255 }
11256 
11257 /// Return true if a shuffle mask chooses elements identically in its top and
11258 /// bottom halves. For example, any splat mask has the same top and bottom
11259 /// halves. If an element is undefined in only one half of the mask, the halves
11260 /// are not considered identical.
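///
/// For example: <0, 3, 7, 7, 0, 3, 7, 7> has identical halves, while
/// <0, 1, 2, 3, 0, 1, -1, 3> does not because element 6 is undef in only one
/// half.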
11261 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
11262   assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
11263   unsigned HalfSize = Mask.size() / 2;
11264   for (unsigned i = 0; i != HalfSize; ++i) {
11265     if (Mask[i] != Mask[i + HalfSize])
11266       return false;
11267   }
11268   return true;
11269 }
11270 
11271 /// Get a 4-lane 8-bit shuffle immediate for a mask.
11272 ///
11273 /// This helper function produces an 8-bit shuffle immediate corresponding to
11274 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
11275 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
11276 /// example.
11277 ///
11278 /// NB: We rely heavily on "undef" masks preserving the input lane.
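///
/// For example (illustrative): the mask <1, 2, 3, 0> encodes as 0b00111001
/// (0x39), with bits [1:0] selecting the source of result lane 0, bits [3:2]
/// lane 1, and so on. Undef lanes default to their own index, and a mask with
/// a single non-undef element such as <2, -1, -1, -1> is splatted to 0xAA to
/// help later broadcast matching.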
11279 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
11280   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
11281   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
11282   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
11283   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
11284   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
11285 
11286   // If the mask only uses one non-undef element, then fully 'splat' it to
11287   // improve later broadcast matching.
11288   int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
11289   assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
11290 
11291   int FirstElt = Mask[FirstIndex];
11292   if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
11293     return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
11294 
11295   unsigned Imm = 0;
11296   Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
11297   Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
11298   Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
11299   Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
11300   return Imm;
11301 }
11302 
11303 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
11304                                           SelectionDAG &DAG) {
11305   return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
11306 }
11307 
// The shuffle result has the following form:
// 0*a[0], 0*a[1], ..., 0*a[n], n >= 0, where the a[] elements appear in
// ascending order and '0*' denotes a (possibly empty) run of zero elements.
// Each element of Zeroable corresponds to a particular element of Mask, as
// described in the computeZeroableShuffleElements function.
//
// The function looks for a sub-mask whose nonzero elements appear in
// increasing order. If such a sub-mask exists, the function returns true.
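//
// For example (illustrative): for a v4i32 shuffle with Mask <0, 4, 1, 4>
// where elements 1 and 3 are zeroable, the nonzero elements select V1[0] and
// V1[1] in order, so this returns true with IsZeroSideLeft == false.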
11315 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
11316                                      ArrayRef<int> Mask, const EVT &VectorType,
11317                                      bool &IsZeroSideLeft) {
11318   int NextElement = -1;
11319   // Check if the Mask's nonzero elements are in increasing order.
11320   for (int i = 0, e = Mask.size(); i < e; i++) {
    // Undef mask elements aren't supported; in particular, the zeroable
    // positions must be fed by real zeros rather than undefs.
11322     assert(Mask[i] >= -1 && "Out of bound mask element!");
11323     if (Mask[i] < 0)
11324       return false;
11325     if (Zeroable[i])
11326       continue;
    // Find the lowest nonzero element.
11328     if (NextElement < 0) {
11329       NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
11330       IsZeroSideLeft = NextElement != 0;
11331     }
11332     // Exit if the mask's non zero elements are not in increasing order.
11333     if (NextElement != Mask[i])
11334       return false;
11335     NextElement++;
11336   }
11337   return true;
11338 }
11339 
11340 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
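///
/// For example: a v8i16 mask <0, 0, 1, 1, 2, 2, 3, 3> becomes the byte-level
/// PSHUFB mask <0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7>.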
11341 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
11342                                       ArrayRef<int> Mask, SDValue V1,
11343                                       SDValue V2, const APInt &Zeroable,
11344                                       const X86Subtarget &Subtarget,
11345                                       SelectionDAG &DAG) {
11346   int Size = Mask.size();
11347   int LaneSize = 128 / VT.getScalarSizeInBits();
11348   const int NumBytes = VT.getSizeInBits() / 8;
11349   const int NumEltBytes = VT.getScalarSizeInBits() / 8;
11350 
11351   assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
11352          (Subtarget.hasAVX2() && VT.is256BitVector()) ||
11353          (Subtarget.hasBWI() && VT.is512BitVector()));
11354 
11355   SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
11356   // Sign bit set in i8 mask means zero element.
11357   SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
11358 
11359   SDValue V;
11360   for (int i = 0; i < NumBytes; ++i) {
11361     int M = Mask[i / NumEltBytes];
11362     if (M < 0) {
11363       PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
11364       continue;
11365     }
11366     if (Zeroable[i / NumEltBytes]) {
11367       PSHUFBMask[i] = ZeroMask;
11368       continue;
11369     }
11370 
11371     // We can only use a single input of V1 or V2.
11372     SDValue SrcV = (M >= Size ? V2 : V1);
11373     if (V && V != SrcV)
11374       return SDValue();
11375     V = SrcV;
11376     M %= Size;
11377 
    // PSHUFB can't cross lanes; ensure this doesn't happen.
11379     if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
11380       return SDValue();
11381 
11382     M = M % LaneSize;
11383     M = M * NumEltBytes + (i % NumEltBytes);
11384     PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
11385   }
11386   assert(V && "Failed to find a source input");
11387 
11388   MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
11389   return DAG.getBitcast(
11390       VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
11391                       DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
11392 }
11393 
11394 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
11395                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
11396                            const SDLoc &dl);
11397 
// X86 has a dedicated shuffle that can be lowered to VEXPAND.
11399 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
11400                                     const APInt &Zeroable,
11401                                     ArrayRef<int> Mask, SDValue &V1,
11402                                     SDValue &V2, SelectionDAG &DAG,
11403                                     const X86Subtarget &Subtarget) {
11404   bool IsLeftZeroSide = true;
11405   if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
11406                                 IsLeftZeroSide))
11407     return SDValue();
11408   unsigned VEXPANDMask = (~Zeroable).getZExtValue();
11409   MVT IntegerType =
11410       MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11411   SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
11412   unsigned NumElts = VT.getVectorNumElements();
11413   assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
11414          "Unexpected number of vector elements");
11415   SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
11416                               Subtarget, DAG, DL);
11417   SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
11418   SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
11419   return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
11420 }
11421 
11422 static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
11423                                   unsigned &UnpackOpcode, bool IsUnary,
11424                                   ArrayRef<int> TargetMask, const SDLoc &DL,
11425                                   SelectionDAG &DAG,
11426                                   const X86Subtarget &Subtarget) {
11427   int NumElts = VT.getVectorNumElements();
11428 
11429   bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
11430   for (int i = 0; i != NumElts; i += 2) {
11431     int M1 = TargetMask[i + 0];
11432     int M2 = TargetMask[i + 1];
11433     Undef1 &= (SM_SentinelUndef == M1);
11434     Undef2 &= (SM_SentinelUndef == M2);
11435     Zero1 &= isUndefOrZero(M1);
11436     Zero2 &= isUndefOrZero(M2);
11437   }
11438   assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
11439          "Zeroable shuffle detected");
11440 
11441   // Attempt to match the target mask against the unpack lo/hi mask patterns.
11442   SmallVector<int, 64> Unpckl, Unpckh;
11443   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
11444   if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
11445                                 (IsUnary ? V1 : V2))) {
11446     UnpackOpcode = X86ISD::UNPCKL;
11447     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11448     V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11449     return true;
11450   }
11451 
11452   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
11453   if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
11454                                 (IsUnary ? V1 : V2))) {
11455     UnpackOpcode = X86ISD::UNPCKH;
11456     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11457     V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11458     return true;
11459   }
11460 
  // If it's a unary shuffle, attempt to match as an unpack lo/hi with zero.
11462   if (IsUnary && (Zero1 || Zero2)) {
11463     // Don't bother if we can blend instead.
11464     if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11465         isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
11466       return false;
11467 
11468     bool MatchLo = true, MatchHi = true;
11469     for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11470       int M = TargetMask[i];
11471 
11472       // Ignore if the input is known to be zero or the index is undef.
11473       if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11474           (M == SM_SentinelUndef))
11475         continue;
11476 
11477       MatchLo &= (M == Unpckl[i]);
11478       MatchHi &= (M == Unpckh[i]);
11479     }
11480 
11481     if (MatchLo || MatchHi) {
11482       UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11483       V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11484       V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11485       return true;
11486     }
11487   }
11488 
11489   // If a binary shuffle, commute and try again.
11490   if (!IsUnary) {
11491     ShuffleVectorSDNode::commuteMask(Unpckl);
11492     if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
11493       UnpackOpcode = X86ISD::UNPCKL;
11494       std::swap(V1, V2);
11495       return true;
11496     }
11497 
11498     ShuffleVectorSDNode::commuteMask(Unpckh);
11499     if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
11500       UnpackOpcode = X86ISD::UNPCKH;
11501       std::swap(V1, V2);
11502       return true;
11503     }
11504   }
11505 
11506   return false;
11507 }
11508 
11509 // X86 has dedicated unpack instructions that can handle specific blend
11510 // operations: UNPCKH and UNPCKL.
11511 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
11512                                      ArrayRef<int> Mask, SDValue V1, SDValue V2,
11513                                      SelectionDAG &DAG) {
11514   SmallVector<int, 8> Unpckl;
11515   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
11516   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11517     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
11518 
11519   SmallVector<int, 8> Unpckh;
11520   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
11521   if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11522     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
11523 
11524   // Commute and try again.
11525   ShuffleVectorSDNode::commuteMask(Unpckl);
11526   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11527     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
11528 
11529   ShuffleVectorSDNode::commuteMask(Unpckh);
11530   if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11531     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
11532 
11533   return SDValue();
11534 }
11535 
/// Check if the mask can be mapped to a preliminary 64-bit element permute
/// (vpermq/vpermpd) followed by a 256-bit unpack.
11538 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
11539                                         ArrayRef<int> Mask, SDValue V1,
11540                                         SDValue V2, SelectionDAG &DAG) {
11541   SmallVector<int, 32> Unpckl, Unpckh;
11542   createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
11543   createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
11544 
11545   unsigned UnpackOpcode;
11546   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11547     UnpackOpcode = X86ISD::UNPCKL;
11548   else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11549     UnpackOpcode = X86ISD::UNPCKH;
11550   else
11551     return SDValue();
11552 
11553   // This is a "natural" unpack operation (rather than the 128-bit sectored
11554   // operation implemented by AVX). We need to rearrange 64-bit chunks of the
11555   // input in order to use the x86 instruction.
11556   V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
11557                             DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
11558   V1 = DAG.getBitcast(VT, V1);
11559   return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
11560 }
11561 
11562 // Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
11563 // source into the lower elements and zeroing the upper elements.
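// For example (illustrative): a v16i8 mask <0,2,4,...,14> whose upper eight
// elements are zeroable matches (given AVX512BW) a v8i16 -> i8 truncation,
// emitted as X86ISD::VTRUNC into a v16i8 result.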
11564 static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
11565                                  ArrayRef<int> Mask, const APInt &Zeroable,
11566                                  const X86Subtarget &Subtarget) {
11567   if (!VT.is512BitVector() && !Subtarget.hasVLX())
11568     return false;
11569 
11570   unsigned NumElts = Mask.size();
11571   unsigned EltSizeInBits = VT.getScalarSizeInBits();
11572   unsigned MaxScale = 64 / EltSizeInBits;
11573 
11574   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11575     unsigned SrcEltBits = EltSizeInBits * Scale;
11576     if (SrcEltBits < 32 && !Subtarget.hasBWI())
11577       continue;
11578     unsigned NumSrcElts = NumElts / Scale;
11579     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
11580       continue;
11581     unsigned UpperElts = NumElts - NumSrcElts;
11582     if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11583       continue;
11584     SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
11585     SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
11586     DstVT = MVT::getIntegerVT(EltSizeInBits);
11587     if ((NumSrcElts * EltSizeInBits) >= 128) {
11588       // ISD::TRUNCATE
11589       DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
11590     } else {
11591       // X86ISD::VTRUNC
11592       DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
11593     }
11594     return true;
11595   }
11596 
11597   return false;
11598 }
11599 
11600 // Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
11601 // element padding to the final DstVT.
11602 static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
11603                                   const X86Subtarget &Subtarget,
11604                                   SelectionDAG &DAG, bool ZeroUppers) {
11605   MVT SrcVT = Src.getSimpleValueType();
11606   MVT DstSVT = DstVT.getScalarType();
11607   unsigned NumDstElts = DstVT.getVectorNumElements();
11608   unsigned NumSrcElts = SrcVT.getVectorNumElements();
11609   unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
11610 
11611   if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
11612     return SDValue();
11613 
11614   // Perform a direct ISD::TRUNCATE if possible.
11615   if (NumSrcElts == NumDstElts)
11616     return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
11617 
11618   if (NumSrcElts > NumDstElts) {
11619     MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11620     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11621     return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
11622   }
11623 
11624   if ((NumSrcElts * DstEltSizeInBits) >= 128) {
11625     MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11626     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11627     return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11628                           DstVT.getSizeInBits());
11629   }
11630 
11631   // Non-VLX targets must truncate from a 512-bit type, so we need to
11632   // widen, truncate and then possibly extract the original subvector.
11633   if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
11634     SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
11635     return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
11636   }
11637 
  // Fall back to an X86ISD::VTRUNC, padding if necessary.
11639   MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
11640   SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
11641   if (DstVT != TruncVT)
11642     Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11643                            DstVT.getSizeInBits());
11644   return Trunc;
11645 }
11646 
11647 // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
11648 //
11649 // An example is the following:
11650 //
11651 // t0: ch = EntryToken
11652 //           t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
11653 //         t25: v4i32 = truncate t2
11654 //       t41: v8i16 = bitcast t25
11655 //       t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
11656 //       Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
11657 //     t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
11658 //   t18: v2i64 = bitcast t51
11659 //
// One can just use a single vpmovdw instruction; without avx512vl we need to
11661 // use the zmm variant and extract the lower subvector, padding with zeroes.
11662 // TODO: Merge with lowerShuffleAsVTRUNC.
11663 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
11664                                      SDValue V2, ArrayRef<int> Mask,
11665                                      const APInt &Zeroable,
11666                                      const X86Subtarget &Subtarget,
11667                                      SelectionDAG &DAG) {
11668   assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
11669   if (!Subtarget.hasAVX512())
11670     return SDValue();
11671 
11672   unsigned NumElts = VT.getVectorNumElements();
11673   unsigned EltSizeInBits = VT.getScalarSizeInBits();
11674   unsigned MaxScale = 64 / EltSizeInBits;
11675   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11676     unsigned NumSrcElts = NumElts / Scale;
11677     unsigned UpperElts = NumElts - NumSrcElts;
11678     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11679         !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11680       continue;
11681 
11682     SDValue Src = V1;
11683     if (!Src.hasOneUse())
11684       return SDValue();
11685 
11686     Src = peekThroughOneUseBitcasts(Src);
11687     if (Src.getOpcode() != ISD::TRUNCATE ||
11688         Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
11689       return SDValue();
11690     Src = Src.getOperand(0);
11691 
11692     // VPMOVWB is only available with avx512bw.
11693     MVT SrcVT = Src.getSimpleValueType();
11694     if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
11695         !Subtarget.hasBWI())
11696       return SDValue();
11697 
11698     bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
11699     return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11700   }
11701 
11702   return SDValue();
11703 }
11704 
11705 // Attempt to match binary shuffle patterns as a truncate.
11706 static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
11707                                     SDValue V2, ArrayRef<int> Mask,
11708                                     const APInt &Zeroable,
11709                                     const X86Subtarget &Subtarget,
11710                                     SelectionDAG &DAG) {
11711   assert((VT.is128BitVector() || VT.is256BitVector()) &&
11712          "Unexpected VTRUNC type");
11713   if (!Subtarget.hasAVX512())
11714     return SDValue();
11715 
11716   unsigned NumElts = VT.getVectorNumElements();
11717   unsigned EltSizeInBits = VT.getScalarSizeInBits();
11718   unsigned MaxScale = 64 / EltSizeInBits;
11719   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11720     // TODO: Support non-BWI VPMOVWB truncations?
11721     unsigned SrcEltBits = EltSizeInBits * Scale;
11722     if (SrcEltBits < 32 && !Subtarget.hasBWI())
11723       continue;
11724 
11725     // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
11726     // Bail if the V2 elements are undef.
11727     unsigned NumHalfSrcElts = NumElts / Scale;
11728     unsigned NumSrcElts = 2 * NumHalfSrcElts;
11729     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11730         isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
11731       continue;
11732 
11733     // The elements beyond the truncation must be undef/zero.
11734     unsigned UpperElts = NumElts - NumSrcElts;
11735     if (UpperElts > 0 &&
11736         !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11737       continue;
11738     bool UndefUppers =
11739         UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
11740 
    // As we're using both sources, we need to concatenate them together and
    // truncate from the double-sized source.
11743     MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
11744     SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
11745 
11746     MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11747     MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11748     Src = DAG.getBitcast(SrcVT, Src);
11749     return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11750   }
11751 
11752   return SDValue();
11753 }
11754 
11755 /// Check whether a compaction lowering can be done by dropping even
11756 /// elements and compute how many times even elements must be dropped.
11757 ///
/// This handles shuffles which take every (2^N)-th element, i.e. compactions
/// where the even elements have been dropped N times. Example shuffle masks:
11760 ///
11761 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
11762 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11763 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
11764 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
11765 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
11766 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
11767 ///
11768 /// Any of these lanes can of course be undef.
11769 ///
11770 /// This routine only supports N <= 3.
11771 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11772 /// for larger N.
11773 ///
11774 /// \returns N above, or the number of times even elements must be dropped if
11775 /// there is such a number. Otherwise returns zero.
11776 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11777                                           bool IsSingleInput) {
11778   // The modulus for the shuffle vector entries is based on whether this is
11779   // a single input or not.
11780   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11781   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11782          "We should only be called with masks with a power-of-2 size!");
11783 
11784   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11785 
11786   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11787   // and 2^3 simultaneously. This is because we may have ambiguity with
11788   // partially undef inputs.
11789   bool ViableForN[3] = {true, true, true};
11790 
11791   for (int i = 0, e = Mask.size(); i < e; ++i) {
11792     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11793     // want.
11794     if (Mask[i] < 0)
11795       continue;
11796 
11797     bool IsAnyViable = false;
11798     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11799       if (ViableForN[j]) {
11800         uint64_t N = j + 1;
11801 
11802         // The shuffle mask must be equal to (i * 2^N) % M.
11803         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11804           IsAnyViable = true;
11805         else
11806           ViableForN[j] = false;
11807       }
11808     // Early exit if we exhaust the possible powers of two.
11809     if (!IsAnyViable)
11810       break;
11811   }
11812 
11813   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11814     if (ViableForN[j])
11815       return j + 1;
11816 
11817   // Return 0 as there is no viable power of two.
11818   return 0;
11819 }
11820 
11821 // X86 has dedicated pack instructions that can handle specific truncation
11822 // operations: PACKSS and PACKUS.
11823 // Checks for compaction shuffle masks if MaxStages > 1.
11824 // TODO: Add support for matching multiple PACKSS/PACKUS stages.
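// For example (illustrative): for v16i8 the single-stage binary pack mask is
// <0,2,4,...,14,16,18,...,30>, i.e. the even elements of both inputs, which
// PACKUS/PACKSS produce when the discarded high bits are known to be zeros or
// sign bits respectively.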
11825 static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
11826                                  unsigned &PackOpcode, ArrayRef<int> TargetMask,
11827                                  const SelectionDAG &DAG,
11828                                  const X86Subtarget &Subtarget,
11829                                  unsigned MaxStages = 1) {
11830   unsigned NumElts = VT.getVectorNumElements();
11831   unsigned BitSize = VT.getScalarSizeInBits();
11832   assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
11833          "Illegal maximum compaction");
11834 
11835   auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
11836     unsigned NumSrcBits = PackVT.getScalarSizeInBits();
11837     unsigned NumPackedBits = NumSrcBits - BitSize;
11838     N1 = peekThroughBitcasts(N1);
11839     N2 = peekThroughBitcasts(N2);
11840     unsigned NumBits1 = N1.getScalarValueSizeInBits();
11841     unsigned NumBits2 = N2.getScalarValueSizeInBits();
11842     bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
11843     bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
11844     if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
11845         (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
11846       return false;
11847     if (Subtarget.hasSSE41() || BitSize == 8) {
11848       APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
11849       if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
11850           (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
11851         V1 = N1;
11852         V2 = N2;
11853         SrcVT = PackVT;
11854         PackOpcode = X86ISD::PACKUS;
11855         return true;
11856       }
11857     }
11858     bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
11859     bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
11860     if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
11861          DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
11862         (N2.isUndef() || IsZero2 || IsAllOnes2 ||
11863          DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
11864       V1 = N1;
11865       V2 = N2;
11866       SrcVT = PackVT;
11867       PackOpcode = X86ISD::PACKSS;
11868       return true;
11869     }
11870     return false;
11871   };
11872 
11873   // Attempt to match against wider and wider compaction patterns.
11874   for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
11875     MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
11876     MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
11877 
11878     // Try binary shuffle.
11879     SmallVector<int, 32> BinaryMask;
11880     createPackShuffleMask(VT, BinaryMask, false, NumStages);
11881     if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
11882       if (MatchPACK(V1, V2, PackVT))
11883         return true;
11884 
11885     // Try unary shuffle.
11886     SmallVector<int, 32> UnaryMask;
11887     createPackShuffleMask(VT, UnaryMask, true, NumStages);
11888     if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
11889       if (MatchPACK(V1, V1, PackVT))
11890         return true;
11891   }
11892 
11893   return false;
11894 }
11895 
11896 static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
11897                                     SDValue V1, SDValue V2, SelectionDAG &DAG,
11898                                     const X86Subtarget &Subtarget) {
11899   MVT PackVT;
11900   unsigned PackOpcode;
11901   unsigned SizeBits = VT.getSizeInBits();
11902   unsigned EltBits = VT.getScalarSizeInBits();
11903   unsigned MaxStages = Log2_32(64 / EltBits);
11904   if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
11905                             Subtarget, MaxStages))
11906     return SDValue();
11907 
11908   unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
11909   unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
11910 
  // Don't lower multi-stage packs on AVX512; truncation is better.
11912   if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
11913     return SDValue();
11914 
11915   // Pack to the largest type possible:
11916   // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
11917   unsigned MaxPackBits = 16;
11918   if (CurrentEltBits > 16 &&
11919       (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
11920     MaxPackBits = 32;
11921 
11922   // Repeatedly pack down to the target size.
11923   SDValue Res;
11924   for (unsigned i = 0; i != NumStages; ++i) {
11925     unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
11926     unsigned NumSrcElts = SizeBits / SrcEltBits;
11927     MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11928     MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
11929     MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11930     MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
11931     Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
11932                       DAG.getBitcast(SrcVT, V2));
11933     V1 = V2 = Res;
11934     CurrentEltBits /= 2;
11935   }
11936   assert(Res && Res.getValueType() == VT &&
11937          "Failed to lower compaction shuffle");
11938   return Res;
11939 }
11940 
11941 /// Try to emit a bitmask instruction for a shuffle.
11942 ///
11943 /// This handles cases where we can model a blend exactly as a bitmask due to
11944 /// one of the inputs being zeroable.
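///
/// For example: a v4i32 shuffle with mask <0, z, 2, z>, where 'z' marks
/// zeroable elements, becomes V1 & <-1, 0, -1, 0>.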
11945 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11946                                      SDValue V2, ArrayRef<int> Mask,
11947                                      const APInt &Zeroable,
11948                                      const X86Subtarget &Subtarget,
11949                                      SelectionDAG &DAG) {
11950   MVT MaskVT = VT;
11951   MVT EltVT = VT.getVectorElementType();
11952   SDValue Zero, AllOnes;
11953   // Use f64 if i64 isn't legal.
11954   if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11955     EltVT = MVT::f64;
11956     MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11957   }
11958 
11959   MVT LogicVT = VT;
11960   if (EltVT == MVT::f32 || EltVT == MVT::f64) {
11961     Zero = DAG.getConstantFP(0.0, DL, EltVT);
11962     APFloat AllOnesValue = APFloat::getAllOnesValue(
11963         SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
11964     AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11965     LogicVT =
11966         MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
11967   } else {
11968     Zero = DAG.getConstant(0, DL, EltVT);
11969     AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11970   }
11971 
11972   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11973   SDValue V;
11974   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11975     if (Zeroable[i])
11976       continue;
11977     if (Mask[i] % Size != i)
11978       return SDValue(); // Not a blend.
11979     if (!V)
11980       V = Mask[i] < Size ? V1 : V2;
11981     else if (V != (Mask[i] < Size ? V1 : V2))
11982       return SDValue(); // Can only let one input through the mask.
11983 
11984     VMaskOps[i] = AllOnes;
11985   }
11986   if (!V)
11987     return SDValue(); // No non-zeroable elements!
11988 
11989   SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11990   VMask = DAG.getBitcast(LogicVT, VMask);
11991   V = DAG.getBitcast(LogicVT, V);
11992   SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11993   return DAG.getBitcast(VT, And);
11994 }
11995 
11996 /// Try to emit a blend instruction for a shuffle using bit math.
11997 ///
11998 /// This is used as a fallback approach when first class blend instructions are
11999 /// unavailable. Currently it is only suitable for integer vectors, but could
12000 /// be generalized for floating point vectors if desirable.
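///
/// For example: a v4i32 blend mask <0, 5, 2, 7> becomes
/// (V1 & <-1, 0, -1, 0>) | (V2 & ~<-1, 0, -1, 0>).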
12001 static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12002                                       SDValue V2, ArrayRef<int> Mask,
12003                                       SelectionDAG &DAG) {
12004   assert(VT.isInteger() && "Only supports integer vector types!");
12005   MVT EltVT = VT.getVectorElementType();
12006   SDValue Zero = DAG.getConstant(0, DL, EltVT);
12007   SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12008   SmallVector<SDValue, 16> MaskOps;
12009   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12010     if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12011       return SDValue(); // Shuffled input!
12012     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12013   }
12014 
12015   SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12016   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12017   V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12018   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12019 }
12020 
12021 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12022                                     SDValue PreservedSrc,
12023                                     const X86Subtarget &Subtarget,
12024                                     SelectionDAG &DAG);
12025 
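// Match a shuffle mask as a per-element blend of V1 and V2, setting bit i of
// BlendMask when element i comes from V2. For example (illustrative), mask
// <0, 5, 2, 7> on a 4-element type yields BlendMask == 0b1010. Zeroable
// result elements can be taken from a zero/undef input, reported via
// ForceV1Zero/ForceV2Zero.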
12026 static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
12027                                 MutableArrayRef<int> Mask,
12028                                 const APInt &Zeroable, bool &ForceV1Zero,
12029                                 bool &ForceV2Zero, uint64_t &BlendMask) {
12030   bool V1IsZeroOrUndef =
12031       V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12032   bool V2IsZeroOrUndef =
12033       V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12034 
12035   BlendMask = 0;
12036   ForceV1Zero = false, ForceV2Zero = false;
12037   assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12038 
12039   // Attempt to generate the binary blend mask. If an input is zero then
12040   // we can use any lane.
12041   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12042     int M = Mask[i];
12043     if (M == SM_SentinelUndef)
12044       continue;
12045     if (M == i)
12046       continue;
12047     if (M == i + Size) {
12048       BlendMask |= 1ull << i;
12049       continue;
12050     }
12051     if (Zeroable[i]) {
12052       if (V1IsZeroOrUndef) {
12053         ForceV1Zero = true;
12054         Mask[i] = i;
12055         continue;
12056       }
12057       if (V2IsZeroOrUndef) {
12058         ForceV2Zero = true;
12059         BlendMask |= 1ull << i;
12060         Mask[i] = i + Size;
12061         continue;
12062       }
12063     }
12064     return false;
12065   }
12066   return true;
12067 }
12068 
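// Widen a blend mask to a larger element count by repeating each bit Scale
// times. For example, scaling 0b0101 by Scale == 2 gives 0b00110011.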
12069 static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
12070                                             int Scale) {
12071   uint64_t ScaledMask = 0;
12072   for (int i = 0; i != Size; ++i)
12073     if (BlendMask & (1ull << i))
12074       ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
12075   return ScaledMask;
12076 }
12077 
12078 /// Try to emit a blend instruction for a shuffle.
12079 ///
12080 /// This doesn't do any checks for the availability of instructions for blending
12081 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
12082 /// be matched in the backend with the type given. What it does check for is
12083 /// that the shuffle mask is a blend, or convertible into a blend with zero.
12084 static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
12085                                    SDValue V2, ArrayRef<int> Original,
12086                                    const APInt &Zeroable,
12087                                    const X86Subtarget &Subtarget,
12088                                    SelectionDAG &DAG) {
12089   uint64_t BlendMask = 0;
12090   bool ForceV1Zero = false, ForceV2Zero = false;
12091   SmallVector<int, 64> Mask(Original.begin(), Original.end());
12092   if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
12093                            BlendMask))
12094     return SDValue();
12095 
12096   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
12097   if (ForceV1Zero)
12098     V1 = getZeroVector(VT, Subtarget, DAG, DL);
12099   if (ForceV2Zero)
12100     V2 = getZeroVector(VT, Subtarget, DAG, DL);
12101 
12102   switch (VT.SimpleTy) {
12103   case MVT::v4i64:
12104   case MVT::v8i32:
12105     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
12106     LLVM_FALLTHROUGH;
12107   case MVT::v4f64:
12108   case MVT::v8f32:
12109     assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
12110     LLVM_FALLTHROUGH;
12111   case MVT::v2f64:
12112   case MVT::v2i64:
12113   case MVT::v4f32:
12114   case MVT::v4i32:
12115   case MVT::v8i16:
12116     assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
12117     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
12118                        DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12119   case MVT::v16i16: {
12120     assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
12121     SmallVector<int, 8> RepeatedMask;
12122     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12123       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
12124       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
12125       BlendMask = 0;
12126       for (int i = 0; i < 8; ++i)
12127         if (RepeatedMask[i] >= 8)
12128           BlendMask |= 1ull << i;
12129       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12130                          DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12131     }
12132     // Use PBLENDW for lower/upper lanes and then blend lanes.
12133     // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
12134     // merge to VSELECT where useful.
12135     uint64_t LoMask = BlendMask & 0xFF;
12136     uint64_t HiMask = (BlendMask >> 8) & 0xFF;
12137     if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
12138       SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12139                                DAG.getTargetConstant(LoMask, DL, MVT::i8));
12140       SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12141                                DAG.getTargetConstant(HiMask, DL, MVT::i8));
12142       return DAG.getVectorShuffle(
12143           MVT::v16i16, DL, Lo, Hi,
12144           {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
12145     }
12146     LLVM_FALLTHROUGH;
12147   }
12148   case MVT::v32i8:
12149     assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
12150     LLVM_FALLTHROUGH;
12151   case MVT::v16i8: {
12152     assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
12153 
12154     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
12155     if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12156                                                Subtarget, DAG))
12157       return Masked;
12158 
12159     if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
12160       MVT IntegerType =
12161           MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12162       SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12163       return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12164     }
12165 
12166     // If we have VPTERNLOG, we can use that as a bit blend.
12167     if (Subtarget.hasVLX())
12168       if (SDValue BitBlend =
12169               lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12170         return BitBlend;
12171 
12172     // Scale the blend by the number of bytes per element.
12173     int Scale = VT.getScalarSizeInBits() / 8;
12174 
12175     // This form of blend is always done on bytes. Compute the byte vector
12176     // type.
12177     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12178 
12179     // x86 allows load folding with blendvb from the 2nd source operand. But
12180     // we are still using LLVM select here (see comment below), so that's V1.
12181     // If V2 can be load-folded and V1 cannot be load-folded, then commute to
12182     // allow that load-folding possibility.
12183     if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
12184       ShuffleVectorSDNode::commuteMask(Mask);
12185       std::swap(V1, V2);
12186     }
12187 
12188     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
12189     // mix of LLVM's code generator and the x86 backend. We tell the code
12190     // generator that boolean values in the elements of an x86 vector register
12191     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
12192     // mapping a select to operand #1, and 'false' mapping to operand #2. The
12193     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
12194     // of the element (the remaining are ignored) and 0 in that high bit would
12195     // mean operand #1 while 1 in the high bit would mean operand #2. So while
12196     // the LLVM model for boolean values in vector elements gets the relevant
12197     // bit set, it is set backwards and over constrained relative to x86's
12198     // actual model.
12199     SmallVector<SDValue, 32> VSELECTMask;
12200     for (int i = 0, Size = Mask.size(); i < Size; ++i)
12201       for (int j = 0; j < Scale; ++j)
12202         VSELECTMask.push_back(
12203             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
12204                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
12205                                           MVT::i8));
12206 
12207     V1 = DAG.getBitcast(BlendVT, V1);
12208     V2 = DAG.getBitcast(BlendVT, V2);
12209     return DAG.getBitcast(
12210         VT,
12211         DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
12212                       V1, V2));
12213   }
12214   case MVT::v16f32:
12215   case MVT::v8f64:
12216   case MVT::v8i64:
12217   case MVT::v16i32:
12218   case MVT::v32i16:
12219   case MVT::v64i8: {
12220     // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
12221     bool OptForSize = DAG.shouldOptForSize();
12222     if (!OptForSize) {
12223       if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12224                                                  Subtarget, DAG))
12225         return Masked;
12226     }
12227 
12228     // Otherwise load an immediate into a GPR, cast to k-register, and use a
12229     // masked move.
12230     MVT IntegerType =
12231         MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12232     SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12233     return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12234   }
12235   default:
12236     llvm_unreachable("Not a supported integer vector type!");
12237   }
12238 }
12239 
12240 /// Try to lower as a blend of elements from two inputs followed by
12241 /// a single-input permutation.
12242 ///
12243 /// This matches the pattern where we can blend elements from two inputs and
12244 /// then reduce the shuffle to a single-input permutation.
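///
/// For example (illustrative): a v4i32 shuffle with mask <1, 6, 3, 4> becomes
/// a blend with mask <4, 1, 6, 3> (each element kept in its home slot),
/// followed by the single-input permute <1, 2, 3, 0>.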
12245 static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
12246                                              SDValue V1, SDValue V2,
12247                                              ArrayRef<int> Mask,
12248                                              SelectionDAG &DAG,
12249                                              bool ImmBlends = false) {
12250   // We build up the blend mask while checking whether a blend is a viable way
12251   // to reduce the shuffle.
12252   SmallVector<int, 32> BlendMask(Mask.size(), -1);
12253   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
12254 
12255   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12256     if (Mask[i] < 0)
12257       continue;
12258 
12259     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
12260 
12261     if (BlendMask[Mask[i] % Size] < 0)
12262       BlendMask[Mask[i] % Size] = Mask[i];
12263     else if (BlendMask[Mask[i] % Size] != Mask[i])
12264       return SDValue(); // Can't blend in the needed input!
12265 
12266     PermuteMask[i] = Mask[i] % Size;
12267   }
12268 
  // If we're limited to immediate blends, bail if the blend mask can't be
  // widened to i16.
12271   unsigned EltSize = VT.getScalarSizeInBits();
12272   if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
12273     return SDValue();
12274 
12275   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
12276   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
12277 }
12278 
12279 /// Try to lower as an unpack of elements from two inputs followed by
12280 /// a single-input permutation.
12281 ///
12282 /// This matches the pattern where we can unpack elements from two inputs and
12283 /// then reduce the shuffle to a single-input (wider) permutation.
12284 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
12285                                              SDValue V1, SDValue V2,
12286                                              ArrayRef<int> Mask,
12287                                              SelectionDAG &DAG) {
12288   int NumElts = Mask.size();
12289   int NumLanes = VT.getSizeInBits() / 128;
12290   int NumLaneElts = NumElts / NumLanes;
12291   int NumHalfLaneElts = NumLaneElts / 2;
12292 
12293   bool MatchLo = true, MatchHi = true;
12294   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12295 
12296   // Determine UNPCKL/UNPCKH type and operand order.
12297   for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12298     for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
12299       int M = Mask[Lane + Elt];
12300       if (M < 0)
12301         continue;
12302 
12303       SDValue &Op = Ops[Elt & 1];
12304       if (M < NumElts && (Op.isUndef() || Op == V1))
12305         Op = V1;
12306       else if (NumElts <= M && (Op.isUndef() || Op == V2))
12307         Op = V2;
12308       else
12309         return SDValue();
12310 
12311       int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
12312       MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
12313                  isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
12314       MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
12315                  isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
12316       if (!MatchLo && !MatchHi)
12317         return SDValue();
12318     }
12319   }
12320   assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
12321 
  // Now check that each pair of elts comes from the same unpack pair
12323   // and set the permute mask based on each pair.
12324   // TODO - Investigate cases where we permute individual elements.
12325   SmallVector<int, 32> PermuteMask(NumElts, -1);
12326   for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12327     for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
12328       int M0 = Mask[Lane + Elt + 0];
12329       int M1 = Mask[Lane + Elt + 1];
12330       if (0 <= M0 && 0 <= M1 &&
12331           (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
12332         return SDValue();
12333       if (0 <= M0)
12334         PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
12335       if (0 <= M1)
12336         PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
12337     }
12338   }
12339 
12340   unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12341   SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
12342   return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
12343 }
12344 
12345 /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
12346 /// permuting the elements of the result in place.
12347 static SDValue lowerShuffleAsByteRotateAndPermute(
12348     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12349     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12350   if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
12351       (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
12352       (VT.is512BitVector() && !Subtarget.hasBWI()))
12353     return SDValue();
12354 
12355   // We don't currently support lane crossing permutes.
12356   if (is128BitLaneCrossingShuffleMask(VT, Mask))
12357     return SDValue();
12358 
12359   int Scale = VT.getScalarSizeInBits() / 8;
12360   int NumLanes = VT.getSizeInBits() / 128;
12361   int NumElts = VT.getVectorNumElements();
12362   int NumEltsPerLane = NumElts / NumLanes;
12363 
12364   // Determine range of mask elts.
12365   bool Blend1 = true;
12366   bool Blend2 = true;
12367   std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12368   std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12369   for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12370     for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12371       int M = Mask[Lane + Elt];
12372       if (M < 0)
12373         continue;
12374       if (M < NumElts) {
12375         Blend1 &= (M == (Lane + Elt));
12376         assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12377         M = M % NumEltsPerLane;
12378         Range1.first = std::min(Range1.first, M);
12379         Range1.second = std::max(Range1.second, M);
12380       } else {
12381         M -= NumElts;
12382         Blend2 &= (M == (Lane + Elt));
12383         assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12384         M = M % NumEltsPerLane;
12385         Range2.first = std::min(Range2.first, M);
12386         Range2.second = std::max(Range2.second, M);
12387       }
12388     }
12389   }
12390 
12391   // Bail if we don't need both elements.
12392   // TODO - it might be worth doing this for unary shuffles if the permute
12393   // can be widened.
12394   if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12395       !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12396     return SDValue();
12397 
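  // If either input is already in place within its lane, a plain blend of the
  // two sources should suffice, so a rotate+permute of wider vectors is
  // unlikely to be worth it - bail.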
12398   if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
12399     return SDValue();
12400 
12401   // Rotate the 2 ops so we can access both ranges, then permute the result.
12402   auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
12403     MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12404     SDValue Rotate = DAG.getBitcast(
12405         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
12406                         DAG.getBitcast(ByteVT, Lo),
12407                         DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
12408     SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
12409     for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12410       for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12411         int M = Mask[Lane + Elt];
12412         if (M < 0)
12413           continue;
12414         if (M < NumElts)
12415           PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12416         else
12417           PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12418       }
12419     }
12420     return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
12421   };
12422 
12423   // Check if the ranges are small enough to rotate from either direction.
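  // e.g. if Range2 sits entirely below Range1, rotating concat(V2,V1) right by
  // Range1.first elements puts V1's used range at the bottom of each lane and
  // V2's used range at the top, so a single in-lane permute can then place
  // every element.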
12424   if (Range2.second < Range1.first)
12425     return RotateAndPermute(V1, V2, Range1.first, 0);
12426   if (Range1.second < Range2.first)
12427     return RotateAndPermute(V2, V1, Range2.first, NumElts);
12428   return SDValue();
12429 }
12430 
12431 /// Generic routine to decompose a shuffle and blend into independent
12432 /// blends and permutes.
12433 ///
12434 /// This matches the extremely common pattern for handling combined
12435 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
12436 /// operations. It will try to pick the best arrangement of shuffles and
12437 /// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
12438 static SDValue lowerShuffleAsDecomposedShuffleMerge(
12439     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12440     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12441   int NumElts = Mask.size();
12442   int NumLanes = VT.getSizeInBits() / 128;
12443   int NumEltsPerLane = NumElts / NumLanes;
12444 
12445   // Shuffle the input elements into the desired positions in V1 and V2 and
12446   // unpack/blend them together.
12447   bool IsAlternating = true;
12448   SmallVector<int, 32> V1Mask(NumElts, -1);
12449   SmallVector<int, 32> V2Mask(NumElts, -1);
12450   SmallVector<int, 32> FinalMask(NumElts, -1);
12451   for (int i = 0; i < NumElts; ++i) {
12452     int M = Mask[i];
12453     if (M >= 0 && M < NumElts) {
12454       V1Mask[i] = M;
12455       FinalMask[i] = i;
12456       IsAlternating &= (i & 1) == 0;
12457     } else if (M >= NumElts) {
12458       V2Mask[i] = M - NumElts;
12459       FinalMask[i] = i + NumElts;
12460       IsAlternating &= (i & 1) == 1;
12461     }
12462   }
12463 
12464   // Try to lower with the simpler initial blend/unpack/rotate strategies unless
12465   // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
12466   // the shuffle may be able to fold with a load or other benefit. However, when
12467   // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
12468   // pre-shuffle first is a better strategy.
12469   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
12470     // Only prefer immediate blends to unpack/rotate.
12471     if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12472                                                           DAG, true))
12473       return BlendPerm;
12474     if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
12475                                                            DAG))
12476       return UnpackPerm;
12477     if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
12478             DL, VT, V1, V2, Mask, Subtarget, DAG))
12479       return RotatePerm;
12480     // Unpack/rotate failed - try again with variable blends.
12481     if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12482                                                           DAG))
12483       return BlendPerm;
12484   }
12485 
12486   // If the final mask is an alternating blend of vXi8/vXi16, convert to an
12487   // UNPCKL(SHUFFLE, SHUFFLE) pattern.
  // TODO: It doesn't have to be alternating - but no lane may take more than
  // half of its elements from either source.
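  // e.g. a v8i16 Mask = [0,11,2,9,4,15,6,13] becomes
  //   V1Mask    = [0,2,4,6,-1,-1,-1,-1]
  //   V2Mask    = [3,1,7,5,-1,-1,-1,-1]
  //   FinalMask = [0,8,1,9,2,10,3,11]
  // i.e. an UNPCKL of the two half-width shuffles.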
12490   if (IsAlternating && VT.getScalarSizeInBits() < 32) {
12491     V1Mask.assign(NumElts, -1);
12492     V2Mask.assign(NumElts, -1);
12493     FinalMask.assign(NumElts, -1);
12494     for (int i = 0; i != NumElts; i += NumEltsPerLane)
12495       for (int j = 0; j != NumEltsPerLane; ++j) {
12496         int M = Mask[i + j];
12497         if (M >= 0 && M < NumElts) {
12498           V1Mask[i + (j / 2)] = M;
12499           FinalMask[i + j] = i + (j / 2);
12500         } else if (M >= NumElts) {
12501           V2Mask[i + (j / 2)] = M - NumElts;
12502           FinalMask[i + j] = i + (j / 2) + NumElts;
12503         }
12504       }
12505   }
12506 
12507   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12508   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12509   return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
12510 }
12511 
12512 /// Try to lower a vector shuffle as a bit rotation.
12513 ///
12514 /// Look for a repeated rotation pattern in each sub group.
/// Returns an ISD::ROTL element rotation amount or -1 on failure.
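///
/// e.g. a v16i8 Mask = [1,0, 3,2, 5,4, ...] repeats within each 2-element
/// sub group as a rotation by 1 element, which can be lowered as a ROTL of
/// each i16 element by 8 bits.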
12516 static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
12517   int NumElts = Mask.size();
12518   assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
12519 
12520   int RotateAmt = -1;
12521   for (int i = 0; i != NumElts; i += NumSubElts) {
12522     for (int j = 0; j != NumSubElts; ++j) {
12523       int M = Mask[i + j];
12524       if (M < 0)
12525         continue;
12526       if (!isInRange(M, i, i + NumSubElts))
12527         return -1;
12528       int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
12529       if (0 <= RotateAmt && Offset != RotateAmt)
12530         return -1;
12531       RotateAmt = Offset;
12532     }
12533   }
12534   return RotateAmt;
12535 }
12536 
12537 static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
12538                                    const X86Subtarget &Subtarget,
12539                                    ArrayRef<int> Mask) {
12540   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12541   assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
12542 
12543   // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
12544   int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
12545   int MaxSubElts = 64 / EltSizeInBits;
12546   for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
12547     int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
12548     if (RotateAmt < 0)
12549       continue;
12550 
12551     int NumElts = Mask.size();
12552     MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
12553     RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
12554     return RotateAmt * EltSizeInBits;
12555   }
12556 
12557   return -1;
12558 }
12559 
12560 /// Lower shuffle using X86ISD::VROTLI rotations.
12561 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
12562                                        ArrayRef<int> Mask,
12563                                        const X86Subtarget &Subtarget,
12564                                        SelectionDAG &DAG) {
12565   // Only XOP + AVX512 targets have bit rotation instructions.
12566   // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
12567   bool IsLegal =
12568       (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
12569   if (!IsLegal && Subtarget.hasSSE3())
12570     return SDValue();
12571 
12572   MVT RotateVT;
12573   int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
12574                                           Subtarget, Mask);
12575   if (RotateAmt < 0)
12576     return SDValue();
12577 
  // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
  // expanded to OR(SRL,SHL), will be more efficient, but if they can
  // widen to vXi16 or more then the existing lowering will be better.
12581   if (!IsLegal) {
12582     if ((RotateAmt % 16) == 0)
12583       return SDValue();
12584     // TODO: Use getTargetVShiftByConstNode.
12585     unsigned ShlAmt = RotateAmt;
12586     unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
12587     V1 = DAG.getBitcast(RotateVT, V1);
12588     SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
12589                               DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
12590     SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
12591                               DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
12592     SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
12593     return DAG.getBitcast(VT, Rot);
12594   }
12595 
12596   SDValue Rot =
12597       DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
12598                   DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
12599   return DAG.getBitcast(VT, Rot);
12600 }
12601 
12602 /// Try to match a vector shuffle as an element rotation.
12603 ///
/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
12605 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
12606                                        ArrayRef<int> Mask) {
12607   int NumElts = Mask.size();
12608 
12609   // We need to detect various ways of spelling a rotation:
12610   //   [11, 12, 13, 14, 15,  0,  1,  2]
12611   //   [-1, 12, 13, 14, -1, -1,  1, -1]
12612   //   [-1, -1, -1, -1, -1, -1,  1,  2]
12613   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
12614   //   [-1,  4,  5,  6, -1, -1,  9, -1]
12615   //   [-1,  4,  5,  6, -1, -1, -1, -1]
12616   int Rotation = 0;
12617   SDValue Lo, Hi;
12618   for (int i = 0; i < NumElts; ++i) {
12619     int M = Mask[i];
12620     assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
12621            "Unexpected mask index.");
12622     if (M < 0)
12623       continue;
12624 
12625     // Determine where a rotated vector would have started.
12626     int StartIdx = i - (M % NumElts);
12627     if (StartIdx == 0)
12628       // The identity rotation isn't interesting, stop.
12629       return -1;
12630 
    // If we found the tail of a vector the rotation must be the missing
    // front (-StartIdx). If we found the head of a vector, the rotation is
    // how many of its leading elements remain in view (NumElts - StartIdx).
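    // e.g. with NumElts == 8, Mask[0] == 11 gives StartIdx == -3 and a
    // candidate rotation of 3; Mask[5] == 0 gives StartIdx == 5 and the same
    // candidate rotation of 8 - 5 == 3.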
12634     int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
12635 
12636     if (Rotation == 0)
12637       Rotation = CandidateRotation;
12638     else if (Rotation != CandidateRotation)
12639       // The rotations don't match, so we can't match this mask.
12640       return -1;
12641 
12642     // Compute which value this mask is pointing at.
12643     SDValue MaskV = M < NumElts ? V1 : V2;
12644 
12645     // Compute which of the two target values this index should be assigned
12646     // to. This reflects whether the high elements are remaining or the low
12647     // elements are remaining.
12648     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
12649 
12650     // Either set up this value if we've not encountered it before, or check
12651     // that it remains consistent.
12652     if (!TargetV)
12653       TargetV = MaskV;
12654     else if (TargetV != MaskV)
12655       // This may be a rotation, but it pulls from the inputs in some
12656       // unsupported interleaving.
12657       return -1;
12658   }
12659 
12660   // Check that we successfully analyzed the mask, and normalize the results.
12661   assert(Rotation != 0 && "Failed to locate a viable rotation!");
12662   assert((Lo || Hi) && "Failed to find a rotated input vector!");
12663   if (!Lo)
12664     Lo = Hi;
12665   else if (!Hi)
12666     Hi = Lo;
12667 
12668   V1 = Lo;
12669   V2 = Hi;
12670 
12671   return Rotation;
12672 }
12673 
12674 /// Try to lower a vector shuffle as a byte rotation.
12675 ///
12676 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
12677 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
12678 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
12680 /// does not check for the profitability of lowering either as PALIGNR or
12681 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
12682 /// This matches shuffle vectors that look like:
12683 ///
12684 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
12685 ///
12686 /// Essentially it concatenates V1 and V2, shifts right by some number of
12687 /// elements, and takes the low elements as the result. Note that while this is
12688 /// specified as a *right shift* because x86 is little-endian, it is a *left
12689 /// rotate* of the vector lanes.
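///
/// For the v8i16 example above the element rotation is 3, which scales to a
/// byte rotation of 3 * (16 / 8) == 6.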
12690 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
12691                                     ArrayRef<int> Mask) {
12692   // Don't accept any shuffles with zero elements.
12693   if (isAnyZero(Mask))
12694     return -1;
12695 
12696   // PALIGNR works on 128-bit lanes.
12697   SmallVector<int, 16> RepeatedMask;
12698   if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
12699     return -1;
12700 
12701   int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
12702   if (Rotation <= 0)
12703     return -1;
12704 
12705   // PALIGNR rotates bytes, so we need to scale the
12706   // rotation based on how many bytes are in the vector lane.
12707   int NumElts = RepeatedMask.size();
12708   int Scale = 16 / NumElts;
12709   return Rotation * Scale;
12710 }
12711 
12712 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
12713                                         SDValue V2, ArrayRef<int> Mask,
12714                                         const X86Subtarget &Subtarget,
12715                                         SelectionDAG &DAG) {
12716   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12717 
12718   SDValue Lo = V1, Hi = V2;
12719   int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
12720   if (ByteRotation <= 0)
12721     return SDValue();
12722 
12723   // Cast the inputs to i8 vector of correct length to match PALIGNR or
12724   // PSLLDQ/PSRLDQ.
12725   MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12726   Lo = DAG.getBitcast(ByteVT, Lo);
12727   Hi = DAG.getBitcast(ByteVT, Hi);
12728 
12729   // SSSE3 targets can use the palignr instruction.
12730   if (Subtarget.hasSSSE3()) {
12731     assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
12732            "512-bit PALIGNR requires BWI instructions");
12733     return DAG.getBitcast(
12734         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
12735                         DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
12736   }
12737 
12738   assert(VT.is128BitVector() &&
12739          "Rotate-based lowering only supports 128-bit lowering!");
12740   assert(Mask.size() <= 16 &&
12741          "Can shuffle at most 16 bytes in a 128-bit vector!");
12742   assert(ByteVT == MVT::v16i8 &&
12743          "SSE2 rotate lowering only needed for v16i8!");
12744 
12745   // Default SSE2 implementation
12746   int LoByteShift = 16 - ByteRotation;
12747   int HiByteShift = ByteRotation;
12748 
12749   SDValue LoShift =
12750       DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12751                   DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12752   SDValue HiShift =
12753       DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12754                   DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12755   return DAG.getBitcast(VT,
12756                         DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12757 }
12758 
12759 /// Try to lower a vector shuffle as a dword/qword rotation.
12760 ///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
12764 ///
12765 /// Essentially it concatenates V1 and V2, shifts right by some number of
12766 /// elements, and takes the low elements as the result. Note that while this is
12767 /// specified as a *right shift* because x86 is little-endian, it is a *left
12768 /// rotate* of the vector lanes.
12769 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12770                                     SDValue V2, ArrayRef<int> Mask,
12771                                     const X86Subtarget &Subtarget,
12772                                     SelectionDAG &DAG) {
12773   assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12774          "Only 32-bit and 64-bit elements are supported!");
12775 
12776   // 128/256-bit vectors are only supported with VLX.
12777   assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12778          && "VLX required for 128/256-bit vectors");
12779 
12780   SDValue Lo = V1, Hi = V2;
12781   int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12782   if (Rotation <= 0)
12783     return SDValue();
12784 
12785   return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12786                      DAG.getTargetConstant(Rotation, DL, MVT::i8));
12787 }
12788 
12789 /// Try to lower a vector shuffle as a byte shift sequence.
12790 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12791                                            SDValue V2, ArrayRef<int> Mask,
12792                                            const APInt &Zeroable,
12793                                            const X86Subtarget &Subtarget,
12794                                            SelectionDAG &DAG) {
12795   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12796   assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12797 
12798   // We need a shuffle that has zeros at one/both ends and a sequential
12799   // shuffle from one source within.
12800   unsigned ZeroLo = Zeroable.countTrailingOnes();
12801   unsigned ZeroHi = Zeroable.countLeadingOnes();
12802   if (!ZeroLo && !ZeroHi)
12803     return SDValue();
12804 
12805   unsigned NumElts = Mask.size();
12806   unsigned Len = NumElts - (ZeroLo + ZeroHi);
12807   if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12808     return SDValue();
12809 
12810   unsigned Scale = VT.getScalarSizeInBits() / 8;
12811   ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12812   if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12813       !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12814     return SDValue();
12815 
12816   SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12817   Res = DAG.getBitcast(MVT::v16i8, Res);
12818 
12819   // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12820   // inner sequential set of elements, possibly offset:
12821   // 01234567 --> zzzzzz01 --> 1zzzzzzz
12822   // 01234567 --> 4567zzzz --> zzzzz456
12823   // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
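  // e.g. for v8i16 Mask = [1,2,3,4,5,zz,zz,zz] (ZeroLo == 0, ZeroHi == 3):
  // VSHLDQ by 4 bytes gives [zz,zz,0,1,2,3,4,5], then VSRLDQ by 6 bytes
  // gives [1,2,3,4,5,zz,zz,zz].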
12824   if (ZeroLo == 0) {
12825     unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12826     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12827                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12828     Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12829                       DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12830   } else if (ZeroHi == 0) {
12831     unsigned Shift = Mask[ZeroLo] % NumElts;
12832     Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12833                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12834     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12835                       DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12836   } else if (!Subtarget.hasSSSE3()) {
    // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12838     // by performing 3 byte shifts. Shuffle combining can kick in above that.
12839     // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12840     unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12841     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12842                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12843     Shift += Mask[ZeroLo] % NumElts;
12844     Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12845                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12846     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12847                       DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12848   } else
12849     return SDValue();
12850 
12851   return DAG.getBitcast(VT, Res);
12852 }
12853 
12854 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12855 ///
12856 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12857 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12858 /// matches elements from one of the input vectors shuffled to the left or
12859 /// right with zeroable elements 'shifted in'. It handles both the strictly
12860 /// bit-wise element shifts and the byte shift across an entire 128-bit double
12861 /// quad word lane.
12862 ///
/// PSLL : (little-endian) left bit shift.
12864 /// [ zz, 0, zz,  2 ]
12865 /// [ -1, 4, zz, -1 ]
12866 /// PSRL : (little-endian) right bit shift.
12867 /// [  1, zz,  3, zz]
12868 /// [ -1, -1,  7, zz]
12869 /// PSLLDQ : (little-endian) left byte shift
12870 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
12871 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
12872 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
12873 /// PSRLDQ : (little-endian) right byte shift
12874 /// [  5, 6,  7, zz, zz, zz, zz, zz]
12875 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
12876 /// [  1, 2, -1, -1, -1, -1, zz, zz]
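///
/// e.g. taking the first PSLL example above as a v4i32 shuffle, the match is
/// Scale == 2, Shift == 1 (left), giving Opcode == X86ISD::VSHLI with
/// ShiftVT == v2i64 and a shift amount of 32 bits.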
12877 static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12878                                unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12879                                int MaskOffset, const APInt &Zeroable,
12880                                const X86Subtarget &Subtarget) {
12881   int Size = Mask.size();
12882   unsigned SizeInBits = Size * ScalarSizeInBits;
12883 
12884   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12885     for (int i = 0; i < Size; i += Scale)
12886       for (int j = 0; j < Shift; ++j)
12887         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12888           return false;
12889 
12890     return true;
12891   };
12892 
12893   auto MatchShift = [&](int Shift, int Scale, bool Left) {
12894     for (int i = 0; i != Size; i += Scale) {
12895       unsigned Pos = Left ? i + Shift : i;
12896       unsigned Low = Left ? i : i + Shift;
12897       unsigned Len = Scale - Shift;
12898       if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12899         return -1;
12900     }
12901 
12902     int ShiftEltBits = ScalarSizeInBits * Scale;
12903     bool ByteShift = ShiftEltBits > 64;
12904     Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12905                   : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12906     int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12907 
12908     // Normalize the scale for byte shifts to still produce an i64 element
12909     // type.
12910     Scale = ByteShift ? Scale / 2 : Scale;
12911 
12912     // We need to round trip through the appropriate type for the shift.
12913     MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12914     ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12915                         : MVT::getVectorVT(ShiftSVT, Size / Scale);
12916     return (int)ShiftAmt;
12917   };
12918 
12919   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12920   // keep doubling the size of the integer elements up to that. We can
12921   // then shift the elements of the integer vector by whole multiples of
12922   // their width within the elements of the larger integer vector. Test each
12923   // multiple to see if we can find a match with the moved element indices
12924   // and that the shifted in elements are all zeroable.
12925   unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12926   for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12927     for (int Shift = 1; Shift != Scale; ++Shift)
12928       for (bool Left : {true, false})
12929         if (CheckZeros(Shift, Scale, Left)) {
12930           int ShiftAmt = MatchShift(Shift, Scale, Left);
12931           if (0 < ShiftAmt)
12932             return ShiftAmt;
12933         }
12934 
12935   // no match
12936   return -1;
12937 }
12938 
12939 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12940                                    SDValue V2, ArrayRef<int> Mask,
12941                                    const APInt &Zeroable,
12942                                    const X86Subtarget &Subtarget,
12943                                    SelectionDAG &DAG) {
12944   int Size = Mask.size();
12945   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12946 
12947   MVT ShiftVT;
12948   SDValue V = V1;
12949   unsigned Opcode;
12950 
12951   // Try to match shuffle against V1 shift.
12952   int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12953                                      Mask, 0, Zeroable, Subtarget);
12954 
12955   // If V1 failed, try to match shuffle against V2 shift.
12956   if (ShiftAmt < 0) {
12957     ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12958                                    Mask, Size, Zeroable, Subtarget);
12959     V = V2;
12960   }
12961 
12962   if (ShiftAmt < 0)
12963     return SDValue();
12964 
12965   assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12966          "Illegal integer vector type");
12967   V = DAG.getBitcast(ShiftVT, V);
12968   V = DAG.getNode(Opcode, DL, ShiftVT, V,
12969                   DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12970   return DAG.getBitcast(VT, V);
12971 }
12972 
12973 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12974 // Remainder of lower half result is zero and upper half is all undef.
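// e.g. a v8i16 Mask = [1,2,zz,zz,-1,-1,-1,-1] matches with Len == 2 and
// Idx == 1, giving BitLen == 32 and BitIdx == 16.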
12975 static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12976                                 ArrayRef<int> Mask, uint64_t &BitLen,
12977                                 uint64_t &BitIdx, const APInt &Zeroable) {
12978   int Size = Mask.size();
12979   int HalfSize = Size / 2;
12980   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12981   assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
12982 
12983   // Upper half must be undefined.
12984   if (!isUndefUpperHalf(Mask))
12985     return false;
12986 
12987   // Determine the extraction length from the part of the
12988   // lower half that isn't zeroable.
12989   int Len = HalfSize;
12990   for (; Len > 0; --Len)
12991     if (!Zeroable[Len - 1])
12992       break;
12993   assert(Len > 0 && "Zeroable shuffle mask");
12994 
12995   // Attempt to match first Len sequential elements from the lower half.
12996   SDValue Src;
12997   int Idx = -1;
12998   for (int i = 0; i != Len; ++i) {
12999     int M = Mask[i];
13000     if (M == SM_SentinelUndef)
13001       continue;
13002     SDValue &V = (M < Size ? V1 : V2);
13003     M = M % Size;
13004 
13005     // The extracted elements must start at a valid index and all mask
13006     // elements must be in the lower half.
13007     if (i > M || M >= HalfSize)
13008       return false;
13009 
13010     if (Idx < 0 || (Src == V && Idx == (M - i))) {
13011       Src = V;
13012       Idx = M - i;
13013       continue;
13014     }
13015     return false;
13016   }
13017 
13018   if (!Src || Idx < 0)
13019     return false;
13020 
13021   assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
13022   BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13023   BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13024   V1 = Src;
13025   return true;
13026 }
13027 
13028 // INSERTQ: Extract lowest Len elements from lower half of second source and
13029 // insert over first source, starting at Idx.
13030 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
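// e.g. a v8i16 Mask = [0,8,9,3,-1,-1,-1,-1] matches with Idx == 1 and
// Len == 2 (insert the low 2 elements of V2 over V1 at element 1), giving
// BitLen == 32 and BitIdx == 16.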
13031 static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
13032                                   ArrayRef<int> Mask, uint64_t &BitLen,
13033                                   uint64_t &BitIdx) {
13034   int Size = Mask.size();
13035   int HalfSize = Size / 2;
13036   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13037 
13038   // Upper half must be undefined.
13039   if (!isUndefUpperHalf(Mask))
13040     return false;
13041 
13042   for (int Idx = 0; Idx != HalfSize; ++Idx) {
13043     SDValue Base;
13044 
13045     // Attempt to match first source from mask before insertion point.
13046     if (isUndefInRange(Mask, 0, Idx)) {
13047       /* EMPTY */
13048     } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
13049       Base = V1;
13050     } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
13051       Base = V2;
13052     } else {
13053       continue;
13054     }
13055 
13056     // Extend the extraction length looking to match both the insertion of
13057     // the second source and the remaining elements of the first.
13058     for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
13059       SDValue Insert;
13060       int Len = Hi - Idx;
13061 
13062       // Match insertion.
13063       if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
13064         Insert = V1;
13065       } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
13066         Insert = V2;
13067       } else {
13068         continue;
13069       }
13070 
13071       // Match the remaining elements of the lower half.
13072       if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
13073         /* EMPTY */
13074       } else if ((!Base || (Base == V1)) &&
13075                  isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
13076         Base = V1;
13077       } else if ((!Base || (Base == V2)) &&
13078                  isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
13079                                             Size + Hi)) {
13080         Base = V2;
13081       } else {
13082         continue;
13083       }
13084 
13085       BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13086       BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13087       V1 = Base;
13088       V2 = Insert;
13089       return true;
13090     }
13091   }
13092 
13093   return false;
13094 }
13095 
13096 /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
13097 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
13098                                      SDValue V2, ArrayRef<int> Mask,
13099                                      const APInt &Zeroable, SelectionDAG &DAG) {
13100   uint64_t BitLen, BitIdx;
13101   if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
13102     return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
13103                        DAG.getTargetConstant(BitLen, DL, MVT::i8),
13104                        DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13105 
13106   if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
13107     return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
13108                        V2 ? V2 : DAG.getUNDEF(VT),
13109                        DAG.getTargetConstant(BitLen, DL, MVT::i8),
13110                        DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13111 
13112   return SDValue();
13113 }
13114 
13115 /// Lower a vector shuffle as a zero or any extension.
13116 ///
13117 /// Given a specific number of elements, element bit width, and extension
13118 /// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
/// can start from an offset element index in the input; to avoid excess
/// shuffling the offset must either be in the bottom lane or at the start
/// of a higher lane. All extended elements must come from the same lane.
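///
/// e.g. a v16i8 shuffle with Mask = [0,zz,zz,zz,1,zz,zz,zz,...] and
/// Scale == 4 zero extends the low four bytes of the input to v4i32.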
13124 static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
13125     const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
13126     ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13127   assert(Scale > 1 && "Need a scale to extend.");
13128   int EltBits = VT.getScalarSizeInBits();
13129   int NumElements = VT.getVectorNumElements();
13130   int NumEltsPerLane = 128 / EltBits;
13131   int OffsetLane = Offset / NumEltsPerLane;
13132   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
13133          "Only 8, 16, and 32 bit elements can be extended.");
13134   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be non-negative.");
13136   assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
13137          "Extension offset must be in the first lane or start an upper lane.");
13138 
13139   // Check that an index is in same lane as the base offset.
13140   auto SafeOffset = [&](int Idx) {
13141     return OffsetLane == (Idx / NumEltsPerLane);
13142   };
13143 
13144   // Shift along an input so that the offset base moves to the first element.
13145   auto ShuffleOffset = [&](SDValue V) {
13146     if (!Offset)
13147       return V;
13148 
13149     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13150     for (int i = 0; i * Scale < NumElements; ++i) {
13151       int SrcIdx = i + Offset;
13152       ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
13153     }
13154     return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
13155   };
13156 
13157   // Found a valid a/zext mask! Try various lowering strategies based on the
13158   // input type and available ISA extensions.
13159   if (Subtarget.hasSSE41()) {
13160     // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
13161     // PUNPCK will catch this in a later shuffle match.
13162     if (Offset && Scale == 2 && VT.is128BitVector())
13163       return SDValue();
13164     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
13165                                  NumElements / Scale);
13166     InputV = ShuffleOffset(InputV);
13167     InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
13168                                     DL, ExtVT, InputV, DAG);
13169     return DAG.getBitcast(VT, InputV);
13170   }
13171 
13172   assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
13173 
13174   // For any extends we can cheat for larger element sizes and use shuffle
13175   // instructions that can fold with a load and/or copy.
13176   if (AnyExt && EltBits == 32) {
13177     int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
13178                          -1};
13179     return DAG.getBitcast(
13180         VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13181                         DAG.getBitcast(MVT::v4i32, InputV),
13182                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13183   }
13184   if (AnyExt && EltBits == 16 && Scale > 2) {
13185     int PSHUFDMask[4] = {Offset / 2, -1,
13186                          SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
13187     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13188                          DAG.getBitcast(MVT::v4i32, InputV),
13189                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13190     int PSHUFWMask[4] = {1, -1, -1, -1};
13191     unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
13192     return DAG.getBitcast(
13193         VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
13194                         DAG.getBitcast(MVT::v8i16, InputV),
13195                         getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
13196   }
13197 
13198   // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
13199   // to 64-bits.
13200   if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
13201     assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
13202     assert(VT.is128BitVector() && "Unexpected vector width!");
13203 
13204     int LoIdx = Offset * EltBits;
13205     SDValue Lo = DAG.getBitcast(
13206         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13207                                 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13208                                 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
13209 
13210     if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
13211       return DAG.getBitcast(VT, Lo);
13212 
13213     int HiIdx = (Offset + 1) * EltBits;
13214     SDValue Hi = DAG.getBitcast(
13215         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13216                                 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13217                                 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
13218     return DAG.getBitcast(VT,
13219                           DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
13220   }
13221 
13222   // If this would require more than 2 unpack instructions to expand, use
13223   // pshufb when available. We can only use more than 2 unpack instructions
13224   // when zero extending i8 elements which also makes it easier to use pshufb.
13225   if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
13226     assert(NumElements == 16 && "Unexpected byte vector width!");
13227     SDValue PSHUFBMask[16];
13228     for (int i = 0; i < 16; ++i) {
13229       int Idx = Offset + (i / Scale);
13230       if ((i % Scale == 0 && SafeOffset(Idx))) {
13231         PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
13232         continue;
13233       }
13234       PSHUFBMask[i] =
13235           AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
13236     }
13237     InputV = DAG.getBitcast(MVT::v16i8, InputV);
13238     return DAG.getBitcast(
13239         VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
13240                         DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
13241   }
13242 
13243   // If we are extending from an offset, ensure we start on a boundary that
13244   // we can unpack from.
13245   int AlignToUnpack = Offset % (NumElements / Scale);
13246   if (AlignToUnpack) {
13247     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13248     for (int i = AlignToUnpack; i < NumElements; ++i)
13249       ShMask[i - AlignToUnpack] = i;
13250     InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
13251     Offset -= AlignToUnpack;
13252   }
13253 
13254   // Otherwise emit a sequence of unpacks.
13255   do {
13256     unsigned UnpackLoHi = X86ISD::UNPCKL;
13257     if (Offset >= (NumElements / 2)) {
13258       UnpackLoHi = X86ISD::UNPCKH;
13259       Offset -= (NumElements / 2);
13260     }
13261 
13262     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
13263     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
13264                          : getZeroVector(InputVT, Subtarget, DAG, DL);
13265     InputV = DAG.getBitcast(InputVT, InputV);
13266     InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
13267     Scale /= 2;
13268     EltBits *= 2;
13269     NumElements /= 2;
13270   } while (Scale > 1);
13271   return DAG.getBitcast(VT, InputV);
13272 }
13273 
13274 /// Try to lower a vector shuffle as a zero extension on any microarch.
13275 ///
13276 /// This routine will try to do everything in its power to cleverly lower
13277 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering; it tries to aggressively
13279 /// match this pattern. It will use all of the micro-architectural details it
13280 /// can to emit an efficient lowering. It handles both blends with all-zero
13281 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
13282 /// masking out later).
13283 ///
13284 /// The reason we have dedicated lowering for zext-style shuffles is that they
13285 /// are both incredibly common and often quite performance sensitive.
13286 static SDValue lowerShuffleAsZeroOrAnyExtend(
13287     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13288     const APInt &Zeroable, const X86Subtarget &Subtarget,
13289     SelectionDAG &DAG) {
13290   int Bits = VT.getSizeInBits();
13291   int NumLanes = Bits / 128;
13292   int NumElements = VT.getVectorNumElements();
13293   int NumEltsPerLane = NumElements / NumLanes;
13294   assert(VT.getScalarSizeInBits() <= 32 &&
13295          "Exceeds 32-bit integer zero extension limit");
13296   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
13297 
13298   // Define a helper function to check a particular ext-scale and lower to it if
13299   // valid.
13300   auto Lower = [&](int Scale) -> SDValue {
13301     SDValue InputV;
13302     bool AnyExt = true;
13303     int Offset = 0;
13304     int Matches = 0;
13305     for (int i = 0; i < NumElements; ++i) {
13306       int M = Mask[i];
13307       if (M < 0)
13308         continue; // Valid anywhere but doesn't tell us anything.
13309       if (i % Scale != 0) {
        // Each of the extended elements needs to be zeroable.
13311         if (!Zeroable[i])
13312           return SDValue();
13313 
13314         // We no longer are in the anyext case.
13315         AnyExt = false;
13316         continue;
13317       }
13318 
      // The base elements need to be consecutive indices into the
      // same input vector.
13321       SDValue V = M < NumElements ? V1 : V2;
13322       M = M % NumElements;
13323       if (!InputV) {
13324         InputV = V;
13325         Offset = M - (i / Scale);
13326       } else if (InputV != V)
13327         return SDValue(); // Flip-flopping inputs.
13328 
13329       // Offset must start in the lowest 128-bit lane or at the start of an
13330       // upper lane.
13331       // FIXME: Is it ever worth allowing a negative base offset?
13332       if (!((0 <= Offset && Offset < NumEltsPerLane) ||
13333             (Offset % NumEltsPerLane) == 0))
13334         return SDValue();
13335 
13336       // If we are offsetting, all referenced entries must come from the same
13337       // lane.
13338       if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
13339         return SDValue();
13340 
13341       if ((M % NumElements) != (Offset + (i / Scale)))
13342         return SDValue(); // Non-consecutive strided elements.
13343       Matches++;
13344     }
13345 
13346     // If we fail to find an input, we have a zero-shuffle which should always
13347     // have already been handled.
13348     // FIXME: Maybe handle this here in case during blending we end up with one?
13349     if (!InputV)
13350       return SDValue();
13351 
13352     // If we are offsetting, don't extend if we only match a single input, we
13353     // can always do better by using a basic PSHUF or PUNPCK.
13354     if (Offset != 0 && Matches < 2)
13355       return SDValue();
13356 
13357     return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
13358                                                  InputV, Mask, Subtarget, DAG);
13359   };
13360 
13361   // The widest scale possible for extending is to a 64-bit integer.
13362   assert(Bits % 64 == 0 &&
13363          "The number of bits in a vector must be divisible by 64 on x86!");
13364   int NumExtElements = Bits / 64;
13365 
13366   // Each iteration, try extending the elements half as much, but into twice as
13367   // many elements.
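  // e.g. for a v16i8 shuffle this tries Scale == 8 (to v2i64), then
  // Scale == 4 (to v4i32), then Scale == 2 (to v8i16).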
13368   for (; NumExtElements < NumElements; NumExtElements *= 2) {
13369     assert(NumElements % NumExtElements == 0 &&
13370            "The input vector size must be divisible by the extended size.");
13371     if (SDValue V = Lower(NumElements / NumExtElements))
13372       return V;
13373   }
13374 
13375   // General extends failed, but 128-bit vectors may be able to use MOVQ.
13376   if (Bits != 128)
13377     return SDValue();
13378 
13379   // Returns one of the source operands if the shuffle can be reduced to a
13380   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
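  // e.g. a v4i32 shuffle with Mask = [0,1,zz,zz] becomes a VZEXT_MOVL of the
  // v2i64 bitcast of V1.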
13381   auto CanZExtLowHalf = [&]() {
13382     for (int i = NumElements / 2; i != NumElements; ++i)
13383       if (!Zeroable[i])
13384         return SDValue();
13385     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
13386       return V1;
13387     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
13388       return V2;
13389     return SDValue();
13390   };
13391 
13392   if (SDValue V = CanZExtLowHalf()) {
13393     V = DAG.getBitcast(MVT::v2i64, V);
13394     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
13395     return DAG.getBitcast(VT, V);
13396   }
13397 
13398   // No viable ext lowering found.
13399   return SDValue();
13400 }
13401 
13402 /// Try to get a scalar value for a specific element of a vector.
13403 ///
13404 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
13405 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
13406                                               SelectionDAG &DAG) {
13407   MVT VT = V.getSimpleValueType();
13408   MVT EltVT = VT.getVectorElementType();
13409   V = peekThroughBitcasts(V);
13410 
  // If the bitcasts change the element size, we can't extract an equivalent
  // element from it.
13413   MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() ||
      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
13415     return SDValue();
13416 
13417   if (V.getOpcode() == ISD::BUILD_VECTOR ||
13418       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
13419     // Ensure the scalar operand is the same size as the destination.
13420     // FIXME: Add support for scalar truncation where possible.
13421     SDValue S = V.getOperand(Idx);
13422     if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
13423       return DAG.getBitcast(EltVT, S);
13424   }
13425 
13426   return SDValue();
13427 }
13428 
13429 /// Helper to test for a load that can be folded with x86 shuffles.
13430 ///
13431 /// This is particularly important because the set of instructions varies
13432 /// significantly based on whether the operand is a load or not.
13433 static bool isShuffleFoldableLoad(SDValue V) {
13434   V = peekThroughBitcasts(V);
13435   return ISD::isNON_EXTLoad(V.getNode());
13436 }
13437 
13438 /// Try to lower insertion of a single element into a zero vector.
13439 ///
13440 /// This is a common pattern that we have especially efficient patterns to lower
13441 /// across all subtarget feature sets.
13442 static SDValue lowerShuffleAsElementInsertion(
13443     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13444     const APInt &Zeroable, const X86Subtarget &Subtarget,
13445     SelectionDAG &DAG) {
13446   MVT ExtVT = VT;
13447   MVT EltVT = VT.getVectorElementType();
13448 
13449   int V2Index =
13450       find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
13451       Mask.begin();
13452   bool IsV1Zeroable = true;
13453   for (int i = 0, Size = Mask.size(); i < Size; ++i)
13454     if (i != V2Index && !Zeroable[i]) {
13455       IsV1Zeroable = false;
13456       break;
13457     }
13458 
13459   // Check for a single input from a SCALAR_TO_VECTOR node.
13460   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
13461   // all the smarts here sunk into that routine. However, the current
13462   // lowering of BUILD_VECTOR makes that nearly impossible until the old
13463   // vector shuffle lowering is dead.
13464   SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
13465                                                DAG);
13466   if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
13467     // We need to zext the scalar if it is smaller than an i32.
13468     V2S = DAG.getBitcast(EltVT, V2S);
13469     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
13470       // Using zext to expand a narrow element won't work for non-zero
13471       // insertions.
13472       if (!IsV1Zeroable)
13473         return SDValue();
13474 
13475       // Zero-extend directly to i32.
13476       ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
13477       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
13478     }
13479     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13480   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
13481              EltVT == MVT::i16) {
13482     // Either not inserting from the low element of the input or the input
13483     // element size is too small to use VZEXT_MOVL to clear the high bits.
13484     return SDValue();
13485   }
13486 
13487   if (!IsV1Zeroable) {
13488     // If V1 can't be treated as a zero vector we have fewer options to lower
13489     // this. We can't support integer vectors or non-zero targets cheaply, and
13490     // the V1 elements can't be permuted in any way.
13491     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
13492     if (!VT.isFloatingPoint() || V2Index != 0)
13493       return SDValue();
13494     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
13495     V1Mask[V2Index] = -1;
13496     if (!isNoopShuffleMask(V1Mask))
13497       return SDValue();
13498     if (!VT.is128BitVector())
13499       return SDValue();
13500 
13501     // Otherwise, use MOVSD or MOVSS.
13502     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
13503            "Only two types of floating point element types to handle!");
13504     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
13505                        ExtVT, V1, V2);
13506   }
13507 
13508   // This lowering only works for the low element with floating point vectors.
13509   if (VT.isFloatingPoint() && V2Index != 0)
13510     return SDValue();
13511 
13512   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
13513   if (ExtVT != VT)
13514     V2 = DAG.getBitcast(VT, V2);
13515 
13516   if (V2Index != 0) {
13517     // If we have 4 or fewer lanes we can cheaply shuffle the element into
13518     // the desired position. Otherwise it is more efficient to do a vector
13519     // shift left. We know that we can do a vector shift left because all
13520     // the inputs are zero.
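    // e.g. inserting into element 5 of an otherwise zero v8i16 uses a
    // VSHLDQ of 5 * 16 / 8 == 10 bytes.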
13521     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
13522       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
13523       V2Shuffle[V2Index] = 0;
13524       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
13525     } else {
13526       V2 = DAG.getBitcast(MVT::v16i8, V2);
13527       V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
13528                        DAG.getTargetConstant(
13529                            V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
13530       V2 = DAG.getBitcast(VT, V2);
13531     }
13532   }
13533   return V2;
13534 }
13535 
13536 /// Try to lower broadcast of a single - truncated - integer element,
13537 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
13538 ///
13539 /// This assumes we have AVX2.
13540 static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
13541                                             int BroadcastIdx,
13542                                             const X86Subtarget &Subtarget,
13543                                             SelectionDAG &DAG) {
13544   assert(Subtarget.hasAVX2() &&
13545          "We can only lower integer broadcasts with AVX2!");
13546 
13547   MVT EltVT = VT.getVectorElementType();
13548   MVT V0VT = V0.getSimpleValueType();
13549 
13550   assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
13551   assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
13552 
13553   MVT V0EltVT = V0VT.getVectorElementType();
13554   if (!V0EltVT.isInteger())
13555     return SDValue();
13556 
13557   const unsigned EltSize = EltVT.getSizeInBits();
13558   const unsigned V0EltSize = V0EltVT.getSizeInBits();
13559 
13560   // This is only a truncation if the original element type is larger.
13561   if (V0EltSize <= EltSize)
13562     return SDValue();
13563 
13564   assert(((V0EltSize % EltSize) == 0) &&
13565          "Scalar type sizes must all be powers of 2 on x86!");
13566 
13567   const unsigned V0Opc = V0.getOpcode();
13568   const unsigned Scale = V0EltSize / EltSize;
13569   const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
13570 
13571   if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
13572       V0Opc != ISD::BUILD_VECTOR)
13573     return SDValue();
13574 
13575   SDValue Scalar = V0.getOperand(V0BroadcastIdx);
13576 
13577   // If we're extracting non-least-significant bits, shift so we can truncate.
13578   // Hopefully, we can fold away the trunc/srl/load into the broadcast.
13579   // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
13580   // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
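  // e.g. broadcasting byte 5 (BroadcastIdx == 5) from a v4i32 source gives
  // Scale == 4, V0BroadcastIdx == 1 and OffsetIdx == 1, so the i32 scalar is
  // shifted right by 8 bits before the truncate.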
13581   if (const int OffsetIdx = BroadcastIdx % Scale)
13582     Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
13583                          DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
13584 
13585   return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
13586                      DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
13587 }
13588 
13589 /// Test whether this can be lowered with a single SHUFPS instruction.
13590 ///
13591 /// This is used to disable more specialized lowerings when the shufps lowering
13592 /// will happen to be efficient.
13593 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
13594   // This routine only handles 128-bit shufps.
13595   assert(Mask.size() == 4 && "Unsupported mask size!");
13596   assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
13597   assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
13598   assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
13599   assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
13600 
13601   // To lower with a single SHUFPS, the low half and the high half of the
13602   // result must each be sourced from a single input.
13603   if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
13604     return false;
13605   if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
13606     return false;
13607 
13608   return true;
13609 }
13610 
13611 /// If we are extracting two 128-bit halves of a vector and shuffling the
13612 /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
13613 /// multi-shuffle lowering.
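///
/// For example (assuming AVX2), shuffling the two v4f32 halves of a v8f32
/// value X with mask {0, 5, 2, 7} can instead be a single VPERMPS of X with
/// mask {0, 5, 2, 7, u, u, u, u} followed by a free ymm->xmm extract of the
/// low 128 bits.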
13614 static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
13615                                              SDValue N1, ArrayRef<int> Mask,
13616                                              SelectionDAG &DAG) {
13617   MVT VT = N0.getSimpleValueType();
13618   assert((VT.is128BitVector() &&
13619           (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
13620          "VPERM* family of shuffles requires 32-bit or 64-bit elements");
13621 
13622   // Check that both sources are extracts of the same source vector.
13623   if (!N0.hasOneUse() || !N1.hasOneUse() ||
13624       N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13625       N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13626       N0.getOperand(0) != N1.getOperand(0))
13627     return SDValue();
13628 
13629   SDValue WideVec = N0.getOperand(0);
13630   MVT WideVT = WideVec.getSimpleValueType();
13631   if (!WideVT.is256BitVector())
13632     return SDValue();
13633 
13634   // Match extracts of each half of the wide source vector. Commute the shuffle
13635   // if the extract of the low half is N1.
13636   unsigned NumElts = VT.getVectorNumElements();
13637   SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
13638   const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13639   const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13640   if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13641     ShuffleVectorSDNode::commuteMask(NewMask);
13642   else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13643     return SDValue();
13644 
13645   // Final bailout: if the mask is simple, we are better off using an extract
13646   // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13647   // because that avoids a constant load from memory.
13648   if (NumElts == 4 &&
13649       (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
13650     return SDValue();
13651 
13652   // Extend the shuffle mask with undef elements.
13653   NewMask.append(NumElts, -1);
13654 
13655   // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13656   SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13657                                       NewMask);
13658   // This is free: ymm -> xmm.
13659   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13660                      DAG.getIntPtrConstant(0, DL));
13661 }
13662 
13663 /// Try to lower broadcast of a single element.
13664 ///
13665 /// For convenience, this code also bundles all of the subtarget feature set
13666 /// filtering. While a little annoying to re-dispatch on type here, there isn't
13667 /// a convenient way to factor it out.
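///
/// For example, a v2f64 splat on an SSE3-only target is lowered with MOVDDUP
/// (which can also fold a load), while with AVX2 an integer splat such as
/// v8i32 element 0 becomes a VBROADCAST (e.g. VPBROADCASTD) from a register
/// or a VBROADCAST_LOAD directly from memory.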
13668 static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13669                                        SDValue V2, ArrayRef<int> Mask,
13670                                        const X86Subtarget &Subtarget,
13671                                        SelectionDAG &DAG) {
13672   if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13673         (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
13674         (Subtarget.hasAVX2() && VT.isInteger())))
13675     return SDValue();
13676 
13677   // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13678   // we can only broadcast from a register with AVX2.
13679   unsigned NumEltBits = VT.getScalarSizeInBits();
13680   unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13681                         ? X86ISD::MOVDDUP
13682                         : X86ISD::VBROADCAST;
13683   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13684 
13685   // Check that the mask is a broadcast.
13686   int BroadcastIdx = getSplatIndex(Mask);
13687   if (BroadcastIdx < 0)
13688     return SDValue();
13689   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13690                                             "a sorted mask where the broadcast "
13691                                             "comes from V1.");
13692 
13693   // Go up the chain of (vector) values to find a scalar load that we can
13694   // combine with the broadcast.
13695   // TODO: Combine this logic with findEltLoadSrc() used by
13696   //       EltsFromConsecutiveLoads().
13697   int BitOffset = BroadcastIdx * NumEltBits;
13698   SDValue V = V1;
13699   for (;;) {
13700     switch (V.getOpcode()) {
13701     case ISD::BITCAST: {
13702       V = V.getOperand(0);
13703       continue;
13704     }
13705     case ISD::CONCAT_VECTORS: {
13706       int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13707       int OpIdx = BitOffset / OpBitWidth;
13708       V = V.getOperand(OpIdx);
13709       BitOffset %= OpBitWidth;
13710       continue;
13711     }
13712     case ISD::EXTRACT_SUBVECTOR: {
13713       // The extraction index adds to the existing offset.
13714       unsigned EltBitWidth = V.getScalarValueSizeInBits();
13715       unsigned Idx = V.getConstantOperandVal(1);
13716       unsigned BeginOffset = Idx * EltBitWidth;
13717       BitOffset += BeginOffset;
13718       V = V.getOperand(0);
13719       continue;
13720     }
13721     case ISD::INSERT_SUBVECTOR: {
13722       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13723       int EltBitWidth = VOuter.getScalarValueSizeInBits();
13724       int Idx = (int)V.getConstantOperandVal(2);
13725       int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13726       int BeginOffset = Idx * EltBitWidth;
13727       int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13728       if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13729         BitOffset -= BeginOffset;
13730         V = VInner;
13731       } else {
13732         V = VOuter;
13733       }
13734       continue;
13735     }
13736     }
13737     break;
13738   }
13739   assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13740   BroadcastIdx = BitOffset / NumEltBits;
13741 
13742   // Do we need to bitcast the source to retrieve the original broadcast index?
13743   bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13744 
13745   // Check if this is a broadcast of a scalar. We special case lowering
13746   // for scalars so that we can more effectively fold with loads.
13747   // If the original value has a larger element type than the shuffle, the
13748   // broadcast element is in essence truncated. Make that explicit to ease
13749   // folding.
13750   if (BitCastSrc && VT.isInteger())
13751     if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13752             DL, VT, V, BroadcastIdx, Subtarget, DAG))
13753       return TruncBroadcast;
13754 
13755   // Also check the simpler case, where we can directly reuse the scalar.
13756   if (!BitCastSrc &&
13757       ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13758        (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13759     V = V.getOperand(BroadcastIdx);
13760 
13761     // If we can't broadcast from a register, check that the input is a load.
13762     if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13763       return SDValue();
13764   } else if (ISD::isNormalLoad(V.getNode()) &&
13765              cast<LoadSDNode>(V)->isSimple()) {
13766     // We do not check for one-use of the vector load because a broadcast load
13767     // is expected to be a win for code size, register pressure, and possibly
13768     // uops even if the original vector load is not eliminated.
13769 
13770     // Reduce the vector load and shuffle to a broadcasted scalar load.
13771     LoadSDNode *Ld = cast<LoadSDNode>(V);
13772     SDValue BaseAddr = Ld->getOperand(1);
13773     MVT SVT = VT.getScalarType();
13774     unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13775     assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13776     SDValue NewAddr =
13777         DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
13778 
13779     // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13780     // than MOVDDUP.
13781     // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13782     if (Opcode == X86ISD::VBROADCAST) {
13783       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13784       SDValue Ops[] = {Ld->getChain(), NewAddr};
13785       V = DAG.getMemIntrinsicNode(
13786           X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13787           DAG.getMachineFunction().getMachineMemOperand(
13788               Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13789       DAG.makeEquivalentMemoryOrdering(Ld, V);
13790       return DAG.getBitcast(VT, V);
13791     }
13792     assert(SVT == MVT::f64 && "Unexpected VT!");
13793     V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13794                     DAG.getMachineFunction().getMachineMemOperand(
13795                         Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13796     DAG.makeEquivalentMemoryOrdering(Ld, V);
13797   } else if (!BroadcastFromReg) {
13798     // We can't broadcast from a vector register.
13799     return SDValue();
13800   } else if (BitOffset != 0) {
13801     // We can only broadcast from the zero-element of a vector register,
13802     // but it can be advantageous to broadcast from the zero-element of a
13803     // subvector.
13804     if (!VT.is256BitVector() && !VT.is512BitVector())
13805       return SDValue();
13806 
13807     // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13808     if (VT == MVT::v4f64 || VT == MVT::v4i64)
13809       return SDValue();
13810 
13811     // Only broadcast the zero-element of a 128-bit subvector.
13812     if ((BitOffset % 128) != 0)
13813       return SDValue();
13814 
13815     assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13816            "Unexpected bit-offset");
13817     assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13818            "Unexpected vector size");
13819     unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13820     V = extract128BitVector(V, ExtractIdx, DAG, DL);
13821   }
13822 
13823   // On AVX we can use VBROADCAST directly for scalar sources.
13824   if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13825     V = DAG.getBitcast(MVT::f64, V);
13826     if (Subtarget.hasAVX()) {
13827       V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13828       return DAG.getBitcast(VT, V);
13829     }
13830     V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13831   }
13832 
13833   // If this is a scalar, do the broadcast on this type and bitcast.
13834   if (!V.getValueType().isVector()) {
13835     assert(V.getScalarValueSizeInBits() == NumEltBits &&
13836            "Unexpected scalar size");
13837     MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13838                                        VT.getVectorNumElements());
13839     return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13840   }
13841 
13842   // We only support broadcasting from 128-bit vectors to minimize the
13843   // number of patterns we need to deal with in isel. So extract down to
13844   // 128-bits, removing as many bitcasts as possible.
13845   if (V.getValueSizeInBits() > 128)
13846     V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
13847 
13848   // Otherwise cast V to a vector with the same element type as VT, but
13849   // possibly narrower than VT. Then perform the broadcast.
13850   unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13851   MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13852   return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13853 }
13854 
13855 // Check for whether we can use INSERTPS to perform the shuffle. We only use
13856 // INSERTPS when the V1 elements are already in the correct locations
13857 // because otherwise we can always use two SHUFPS instructions, which are
13858 // much smaller to encode than a SHUFPS and an INSERTPS. We can also
13859 // perform INSERTPS if a single V1 element is out of place and all V2
13860 // elements are zeroable.
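//
// The InsertPSMask built below follows the INSERTPS immediate layout: bits
// [7:6] select the source element of V2, bits [5:4] select the destination
// slot, and bits [3:0] form the zero mask. For example, inserting V2[1] into
// slot 2 while zeroing slot 3 encodes as 0x68.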
13861 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13862                                    unsigned &InsertPSMask,
13863                                    const APInt &Zeroable,
13864                                    ArrayRef<int> Mask, SelectionDAG &DAG) {
13865   assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13866   assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13867   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13868 
13869   // Attempt to match INSERTPS with one element from VA or VB being
13870   // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13871   // are updated.
13872   auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13873                              ArrayRef<int> CandidateMask) {
13874     unsigned ZMask = 0;
13875     int VADstIndex = -1;
13876     int VBDstIndex = -1;
13877     bool VAUsedInPlace = false;
13878 
13879     for (int i = 0; i < 4; ++i) {
13880       // Synthesize a zero mask from the zeroable elements (includes undefs).
13881       if (Zeroable[i]) {
13882         ZMask |= 1 << i;
13883         continue;
13884       }
13885 
13886       // Flag if we use any VA inputs in place.
13887       if (i == CandidateMask[i]) {
13888         VAUsedInPlace = true;
13889         continue;
13890       }
13891 
13892       // We can only insert a single non-zeroable element.
13893       if (VADstIndex >= 0 || VBDstIndex >= 0)
13894         return false;
13895 
13896       if (CandidateMask[i] < 4) {
13897         // VA input out of place for insertion.
13898         VADstIndex = i;
13899       } else {
13900         // VB input for insertion.
13901         VBDstIndex = i;
13902       }
13903     }
13904 
13905     // Don't bother if we have no (non-zeroable) element for insertion.
13906     if (VADstIndex < 0 && VBDstIndex < 0)
13907       return false;
13908 
13909     // Determine element insertion src/dst indices. The src index is from the
13910     // start of the inserted vector, not the start of the concatenated vector.
13911     unsigned VBSrcIndex = 0;
13912     if (VADstIndex >= 0) {
13913       // If we have a VA input out of place, we use VA as the V2 element
13914       // insertion and don't use the original V2 at all.
13915       VBSrcIndex = CandidateMask[VADstIndex];
13916       VBDstIndex = VADstIndex;
13917       VB = VA;
13918     } else {
13919       VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13920     }
13921 
13922     // If no V1 inputs are used in place, then the result is created only from
13923     // the zero mask and the V2 insertion - so remove V1 dependency.
13924     if (!VAUsedInPlace)
13925       VA = DAG.getUNDEF(MVT::v4f32);
13926 
13927     // Update V1, V2 and InsertPSMask accordingly.
13928     V1 = VA;
13929     V2 = VB;
13930 
13931     // Insert the V2 element into the desired position.
13932     InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13933     assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13934     return true;
13935   };
13936 
13937   if (matchAsInsertPS(V1, V2, Mask))
13938     return true;
13939 
13940   // Commute and try again.
13941   SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
13942   ShuffleVectorSDNode::commuteMask(CommutedMask);
13943   if (matchAsInsertPS(V2, V1, CommutedMask))
13944     return true;
13945 
13946   return false;
13947 }
13948 
13949 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13950                                       ArrayRef<int> Mask, const APInt &Zeroable,
13951                                       SelectionDAG &DAG) {
13952   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13953   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13954 
13955   // Attempt to match the insertps pattern.
13956   unsigned InsertPSMask = 0;
13957   if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13958     return SDValue();
13959 
13960   // Insert the V2 element into the desired position.
13961   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13962                      DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13963 }
13964 
13965 /// Try to lower a shuffle as a permute of the inputs followed by an
13966 /// UNPCK instruction.
13967 ///
13968 /// This specifically targets cases where we end up with alternating between
13969 /// the two inputs, and so can permute them into something that feeds a single
13970 /// UNPCK instruction. Note that this routine only targets integer vectors
13971 /// because for floating point vectors we have a generalized SHUFPS lowering
13972 /// strategy that handles everything that doesn't *exactly* match an unpack,
13973 /// making this clever lowering unnecessary.
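///
/// For example, the v4i32 mask {1, 4, 3, 6} is not a direct UNPCK pattern,
/// but after permuting V1 to {1, 3, u, u} and V2 to {0, 2, u, u} a single
/// UNPCKL produces exactly {1, 4, 3, 6}.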
13974 static SDValue lowerShuffleAsPermuteAndUnpack(
13975     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13976     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13977   assert(!VT.isFloatingPoint() &&
13978          "This routine only supports integer vectors.");
13979   assert(VT.is128BitVector() &&
13980          "This routine only works on 128-bit vectors.");
13981   assert(!V2.isUndef() &&
13982          "This routine should only be used when blending two inputs.");
13983   assert(Mask.size() >= 2 && "Single element masks are invalid.");
13984 
13985   int Size = Mask.size();
13986 
13987   int NumLoInputs =
13988       count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13989   int NumHiInputs =
13990       count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13991 
13992   bool UnpackLo = NumLoInputs >= NumHiInputs;
13993 
13994   auto TryUnpack = [&](int ScalarSize, int Scale) {
13995     SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13996     SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13997 
13998     for (int i = 0; i < Size; ++i) {
13999       if (Mask[i] < 0)
14000         continue;
14001 
14002       // Each element of the unpack contains Scale elements from this mask.
14003       int UnpackIdx = i / Scale;
14004 
14005       // We only handle the case where V1 feeds the first slots of the unpack.
14006       // We rely on canonicalization to ensure this is the case.
14007       if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
14008         return SDValue();
14009 
14010       // Set up the mask for this input. The indexing is tricky as we have to
14011       // handle the unpack stride.
14012       SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
14013       VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
14014           Mask[i] % Size;
14015     }
14016 
14017     // If we will have to shuffle both inputs to use the unpack, check whether
14018     // we can just unpack first and shuffle the result. If so, skip this unpack.
14019     if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
14020         !isNoopShuffleMask(V2Mask))
14021       return SDValue();
14022 
14023     // Shuffle the inputs into place.
14024     V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
14025     V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
14026 
14027     // Cast the inputs to the type we will use to unpack them.
14028     MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
14029     V1 = DAG.getBitcast(UnpackVT, V1);
14030     V2 = DAG.getBitcast(UnpackVT, V2);
14031 
14032     // Unpack the inputs and cast the result back to the desired type.
14033     return DAG.getBitcast(
14034         VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14035                         UnpackVT, V1, V2));
14036   };
14037 
14038   // We try each unpack from the largest to the smallest to try and find one
14039   // that fits this mask.
14040   int OrigScalarSize = VT.getScalarSizeInBits();
14041   for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
14042     if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
14043       return Unpack;
14044 
14045   // If we're shuffling with a zero vector then we're better off not doing
14046   // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
14047   if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
14048       ISD::isBuildVectorAllZeros(V2.getNode()))
14049     return SDValue();
14050 
14051   // If none of the unpack-rooted lowerings worked (or were profitable) try an
14052   // initial unpack.
14053   if (NumLoInputs == 0 || NumHiInputs == 0) {
14054     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
14055            "We have to have *some* inputs!");
14056     int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
14057 
14058     // FIXME: We could consider the total complexity of the permute of each
14059     // possible unpacking. Or at the least we should consider how many
14060     // half-crossings are created.
14061     // FIXME: We could consider commuting the unpacks.
14062 
14063     SmallVector<int, 32> PermMask((unsigned)Size, -1);
14064     for (int i = 0; i < Size; ++i) {
14065       if (Mask[i] < 0)
14066         continue;
14067 
14068       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
14069 
14070       PermMask[i] =
14071           2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
14072     }
14073     return DAG.getVectorShuffle(
14074         VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
14075                             DL, VT, V1, V2),
14076         DAG.getUNDEF(VT), PermMask);
14077   }
14078 
14079   return SDValue();
14080 }
14081 
14082 /// Handle lowering of 2-lane 64-bit floating point shuffles.
14083 ///
14084 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
14085 /// support for floating point shuffles but not integer shuffles. These
14086 /// instructions will incur a domain crossing penalty on some chips though so
14087 /// it is better to avoid lowering through this for integer vectors where
14088 /// possible.
14089 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14090                                  const APInt &Zeroable, SDValue V1, SDValue V2,
14091                                  const X86Subtarget &Subtarget,
14092                                  SelectionDAG &DAG) {
14093   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14094   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14095   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14096 
14097   if (V2.isUndef()) {
14098     // Check for being able to broadcast a single element.
14099     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
14100                                                     Mask, Subtarget, DAG))
14101       return Broadcast;
14102 
14103     // Straight shuffle of a single input vector. Simulate this by using the
14104     // single input as both of the "inputs" to this instruction.
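    // The SHUFPD/VPERMILPD immediate has one selector bit per result element
    // (bit 0 picks the low result, bit 1 the high result); e.g. the splat
    // mask {1, 1} encodes as 0b11.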
14105     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
14106 
14107     if (Subtarget.hasAVX()) {
14108       // If we have AVX, we can use VPERMILPS which will allow folding a load
14109       // into the shuffle.
14110       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
14111                          DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14112     }
14113 
14114     return DAG.getNode(
14115         X86ISD::SHUFP, DL, MVT::v2f64,
14116         Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14117         Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14118         DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14119   }
14120   assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14121   assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14122   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14123   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14124 
14125   if (Subtarget.hasAVX2())
14126     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14127       return Extract;
14128 
14129   // When loading a scalar and then shuffling it into a vector we can often do
14130   // the insertion cheaply.
14131   if (SDValue Insertion = lowerShuffleAsElementInsertion(
14132           DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14133     return Insertion;
14134   // Try inverting the insertion since for v2 masks it is easy to do and we
14135   // can't reliably sort the mask one way or the other.
14136   int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
14137                         Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
14138   if (SDValue Insertion = lowerShuffleAsElementInsertion(
14139           DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14140     return Insertion;
14141 
14142   // Try to use one of the special instruction patterns to handle two common
14143   // blend patterns if a zero-blend above didn't work.
14144   if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
14145       isShuffleEquivalent(Mask, {1, 3}, V1, V2))
14146     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
14147       // We can either use a special instruction to load over the low double or
14148       // to move just the low double.
14149       return DAG.getNode(
14150           X86ISD::MOVSD, DL, MVT::v2f64, V2,
14151           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
14152 
14153   if (Subtarget.hasSSE41())
14154     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
14155                                             Zeroable, Subtarget, DAG))
14156       return Blend;
14157 
14158   // Use dedicated unpack instructions for masks that match their pattern.
14159   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
14160     return V;
14161 
14162   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
14163   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
14164                      DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14165 }
14166 
14167 /// Handle lowering of 2-lane 64-bit integer shuffles.
14168 ///
14169 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
14170 /// the integer unit to minimize domain crossing penalties. However, for blends
14171 /// it falls back to the floating point shuffle operation with appropriate bit
14172 /// casting.
14173 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14174                                  const APInt &Zeroable, SDValue V1, SDValue V2,
14175                                  const X86Subtarget &Subtarget,
14176                                  SelectionDAG &DAG) {
14177   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14178   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14179   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14180 
14181   if (V2.isUndef()) {
14182     // Check for being able to broadcast a single element.
14183     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
14184                                                     Mask, Subtarget, DAG))
14185       return Broadcast;
14186 
14187     // Straight shuffle of a single input vector. For everything from SSE2
14188     // onward this has a single fast instruction with no scary immediates.
14189     // We have to map the mask as it is actually a v4i32 shuffle instruction.
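    // For example, the v2i64 mask {1, 0} widens to the v4i32 PSHUFD mask
    // {2, 3, 0, 1}.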
14190     V1 = DAG.getBitcast(MVT::v4i32, V1);
14191     int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
14192                           Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
14193                           Mask[1] < 0 ? -1 : (Mask[1] * 2),
14194                           Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
14195     return DAG.getBitcast(
14196         MVT::v2i64,
14197         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14198                     getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
14199   }
14200   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
14201   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
14202   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14203   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14204 
14205   if (Subtarget.hasAVX2())
14206     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14207       return Extract;
14208 
14209   // Try to use shift instructions.
14210   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
14211                                           Zeroable, Subtarget, DAG))
14212     return Shift;
14213 
14214   // When loading a scalar and then shuffling it into a vector we can often do
14215   // the insertion cheaply.
14216   if (SDValue Insertion = lowerShuffleAsElementInsertion(
14217           DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14218     return Insertion;
14219   // Try inverting the insertion since for v2 masks it is easy to do and we
14220   // can't reliably sort the mask one way or the other.
14221   int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14222   if (SDValue Insertion = lowerShuffleAsElementInsertion(
14223           DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14224     return Insertion;
14225 
14226   // We have different paths for blend lowering, but they all must use the
14227   // *exact* same predicate.
14228   bool IsBlendSupported = Subtarget.hasSSE41();
14229   if (IsBlendSupported)
14230     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14231                                             Zeroable, Subtarget, DAG))
14232       return Blend;
14233 
14234   // Use dedicated unpack instructions for masks that match their pattern.
14235   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
14236     return V;
14237 
14238   // Try to use byte rotation instructions.
14239   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14240   if (Subtarget.hasSSSE3()) {
14241     if (Subtarget.hasVLX())
14242       if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14243                                                 Subtarget, DAG))
14244         return Rotate;
14245 
14246     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14247                                                   Subtarget, DAG))
14248       return Rotate;
14249   }
14250 
14251   // If we have direct support for blends, we should lower by decomposing into
14252   // a permute. That will be faster than the domain cross.
14253   if (IsBlendSupported)
14254     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14255                                                 Subtarget, DAG);
14256 
14257   // We implement this with SHUFPD which is pretty lame because it will likely
14258   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14259   // However, all the alternatives are still more cycles and newer chips don't
14260   // have this problem. It would be really nice if x86 had better shuffles here.
14261   V1 = DAG.getBitcast(MVT::v2f64, V1);
14262   V2 = DAG.getBitcast(MVT::v2f64, V2);
14263   return DAG.getBitcast(MVT::v2i64,
14264                         DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14265 }
14266 
14267 /// Lower a vector shuffle using the SHUFPS instruction.
14268 ///
14269 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
14270 /// It makes no assumptions about whether this is the *best* lowering; it simply
14271 /// uses it.
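///
/// For example, the v4f32 mask {0, 4, 2, 6} mixes V1 and V2 in both halves,
/// so it is lowered as an intermediate SHUFPS blend followed by a second
/// SHUFPS that places the blended elements.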
14272 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
14273                                       ArrayRef<int> Mask, SDValue V1,
14274                                       SDValue V2, SelectionDAG &DAG) {
14275   SDValue LowV = V1, HighV = V2;
14276   SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14277   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14278 
14279   if (NumV2Elements == 1) {
14280     int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14281 
14282     // Compute the index adjacent to V2Index and in the same half by toggling
14283     // the low bit.
14284     int V2AdjIndex = V2Index ^ 1;
14285 
14286     if (Mask[V2AdjIndex] < 0) {
14287       // Handles all the cases where we have a single V2 element and an undef.
14288       // This will only ever happen in the high lanes because we commute the
14289       // vector otherwise.
14290       if (V2Index < 2)
14291         std::swap(LowV, HighV);
14292       NewMask[V2Index] -= 4;
14293     } else {
14294       // Handle the case where the V2 element ends up adjacent to a V1 element.
14295       // To make this work, blend them together as the first step.
14296       int V1Index = V2AdjIndex;
14297       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14298       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14299                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14300 
14301       // Now proceed to reconstruct the final blend as we have the necessary
14302       // high or low half formed.
14303       if (V2Index < 2) {
14304         LowV = V2;
14305         HighV = V1;
14306       } else {
14307         HighV = V2;
14308       }
14309       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14310       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
14311     }
14312   } else if (NumV2Elements == 2) {
14313     if (Mask[0] < 4 && Mask[1] < 4) {
14314       // Handle the easy case where we have V1 in the low lanes and V2 in the
14315       // high lanes.
14316       NewMask[2] -= 4;
14317       NewMask[3] -= 4;
14318     } else if (Mask[2] < 4 && Mask[3] < 4) {
14319       // We also handle the reversed case because this utility may get called
14320       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14321       // arrange things in the right direction.
14322       NewMask[0] -= 4;
14323       NewMask[1] -= 4;
14324       HighV = V1;
14325       LowV = V2;
14326     } else {
14327       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14328       // trying to place elements directly, just blend them and set up the final
14329       // shuffle to place them.
14330 
14331       // The first two blend mask elements are for V1, the second two are for
14332       // V2.
14333       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14334                           Mask[2] < 4 ? Mask[2] : Mask[3],
14335                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14336                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14337       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14338                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14339 
14340       // Now we do a normal shuffle of V1 by giving V1 as both operands to
14341       // a blend.
14342       LowV = HighV = V1;
14343       NewMask[0] = Mask[0] < 4 ? 0 : 2;
14344       NewMask[1] = Mask[0] < 4 ? 2 : 0;
14345       NewMask[2] = Mask[2] < 4 ? 1 : 3;
14346       NewMask[3] = Mask[2] < 4 ? 3 : 1;
14347     }
14348   } else if (NumV2Elements == 3) {
14349     // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
14350     // we can get here via other paths (e.g. repeated mask matching) for which
14351     // we don't want to do another round of lowerVECTOR_SHUFFLE.
14352     ShuffleVectorSDNode::commuteMask(NewMask);
14353     return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14354   }
14355   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14356                      getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14357 }
14358 
14359 /// Lower 4-lane 32-bit floating point shuffles.
14360 ///
14361 /// Uses instructions exclusively from the floating point unit to minimize
14362 /// domain crossing penalties, as these are sufficient to implement all v4f32
14363 /// shuffles.
14364 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14365                                  const APInt &Zeroable, SDValue V1, SDValue V2,
14366                                  const X86Subtarget &Subtarget,
14367                                  SelectionDAG &DAG) {
14368   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14369   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14370   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14371 
14372   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14373 
14374   if (NumV2Elements == 0) {
14375     // Check for being able to broadcast a single element.
14376     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14377                                                     Mask, Subtarget, DAG))
14378       return Broadcast;
14379 
14380     // Use even/odd duplicate instructions for masks that match their pattern.
14381     if (Subtarget.hasSSE3()) {
14382       if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14383         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14384       if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14385         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
14386     }
14387 
14388     if (Subtarget.hasAVX()) {
14389       // If we have AVX, we can use VPERMILPS which will allow folding a load
14390       // into the shuffle.
14391       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14392                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14393     }
14394 
14395     // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14396     // in SSE1 because otherwise they are widened to v2f64 and never get here.
14397     if (!Subtarget.hasSSE2()) {
14398       if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14399         return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14400       if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14401         return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14402     }
14403 
14404     // Otherwise, use a straight shuffle of a single input vector. We pass the
14405     // input vector to both operands to simulate this with a SHUFPS.
14406     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14407                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14408   }
14409 
14410   if (Subtarget.hasAVX2())
14411     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14412       return Extract;
14413 
14414   // There are special ways we can lower some single-element blends. However, we
14415   // have custom ways we can lower more complex single-element blends below that
14416   // we defer to if both this and BLENDPS fail to match, so restrict this to
14417   // when the V2 input is targeting element 0 of the mask -- that is the fast
14418   // case here.
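  // For example, with the mask {4, 1, 2, 3} the single V2 element targets
  // slot 0, the cheap case: it maps to a MOVSS-style blend of the low element.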
14419   if (NumV2Elements == 1 && Mask[0] >= 4)
14420     if (SDValue V = lowerShuffleAsElementInsertion(
14421             DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14422       return V;
14423 
14424   if (Subtarget.hasSSE41()) {
14425     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14426                                             Zeroable, Subtarget, DAG))
14427       return Blend;
14428 
14429     // Use INSERTPS if we can complete the shuffle efficiently.
14430     if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14431       return V;
14432 
14433     if (!isSingleSHUFPSMask(Mask))
14434       if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
14435                                                             V2, Mask, DAG))
14436         return BlendPerm;
14437   }
14438 
14439   // Use low/high mov instructions. These are only valid in SSE1 because
14440   // otherwise they are widened to v2f64 and never get here.
14441   if (!Subtarget.hasSSE2()) {
14442     if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
14443       return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
14444     if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
14445       return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
14446   }
14447 
14448   // Use dedicated unpack instructions for masks that match their pattern.
14449   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
14450     return V;
14451 
14452   // Otherwise fall back to a SHUFPS lowering strategy.
14453   return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
14454 }
14455 
14456 /// Lower 4-lane i32 vector shuffles.
14457 ///
14458 /// We try to handle these with integer-domain shuffles where we can, but for
14459 /// blends we use the floating point domain blend instructions.
14460 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14461                                  const APInt &Zeroable, SDValue V1, SDValue V2,
14462                                  const X86Subtarget &Subtarget,
14463                                  SelectionDAG &DAG) {
14464   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14465   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14466   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14467 
14468   // Whenever we can lower this as a zext, that instruction is strictly faster
14469   // than any alternative. It also allows us to fold memory operands into the
14470   // shuffle in many cases.
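  // For example, {0, Z, 1, Z} with Z zeroable can be lowered as a
  // zero-extension of the low two elements (PMOVZXDQ with SSE4.1, or an
  // unpack against zero).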
14471   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
14472                                                    Zeroable, Subtarget, DAG))
14473     return ZExt;
14474 
14475   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14476 
14477   if (NumV2Elements == 0) {
14478     // Try to use broadcast unless the mask only has one non-undef element.
14479     if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
14480       if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
14481                                                       Mask, Subtarget, DAG))
14482         return Broadcast;
14483     }
14484 
14485     // Straight shuffle of a single input vector. For everything from SSE2
14486     // onward this has a single fast instruction with no scary immediates.
14487     // We coerce the shuffle pattern to be compatible with UNPCK instructions
14488     // but we aren't actually going to use the UNPCK instruction because doing
14489     // so prevents folding a load into this instruction or making a copy.
14490     const int UnpackLoMask[] = {0, 0, 1, 1};
14491     const int UnpackHiMask[] = {2, 2, 3, 3};
14492     if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
14493       Mask = UnpackLoMask;
14494     else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
14495       Mask = UnpackHiMask;
14496 
14497     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14498                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14499   }
14500 
14501   if (Subtarget.hasAVX2())
14502     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14503       return Extract;
14504 
14505   // Try to use shift instructions.
14506   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
14507                                           Zeroable, Subtarget, DAG))
14508     return Shift;
14509 
14510   // There are special ways we can lower some single-element blends.
14511   if (NumV2Elements == 1)
14512     if (SDValue V = lowerShuffleAsElementInsertion(
14513             DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14514       return V;
14515 
14516   // We have different paths for blend lowering, but they all must use the
14517   // *exact* same predicate.
14518   bool IsBlendSupported = Subtarget.hasSSE41();
14519   if (IsBlendSupported)
14520     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
14521                                             Zeroable, Subtarget, DAG))
14522       return Blend;
14523 
14524   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
14525                                              Zeroable, Subtarget, DAG))
14526     return Masked;
14527 
14528   // Use dedicated unpack instructions for masks that match their pattern.
14529   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
14530     return V;
14531 
14532   // Try to use byte rotation instructions.
14533   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14534   if (Subtarget.hasSSSE3()) {
14535     if (Subtarget.hasVLX())
14536       if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
14537                                                 Subtarget, DAG))
14538         return Rotate;
14539 
14540     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
14541                                                   Subtarget, DAG))
14542       return Rotate;
14543   }
14544 
14545   // Assume that a single SHUFPS is faster than an alternative sequence of
14546   // multiple instructions (even if the CPU has a domain penalty).
14547   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14548   if (!isSingleSHUFPSMask(Mask)) {
14549     // If we have direct support for blends, we should lower by decomposing into
14550     // a permute. That will be faster than the domain cross.
14551     if (IsBlendSupported)
14552       return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
14553                                                   Subtarget, DAG);
14554 
14555     // Try to lower by permuting the inputs into an unpack instruction.
14556     if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
14557                                                         Mask, Subtarget, DAG))
14558       return Unpack;
14559   }
14560 
14561   // We implement this with SHUFPS because it can blend from two vectors.
14562   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
14563   // up the inputs, bypassing domain shift penalties that we would incur if we
14564   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
14565   // relevant.
14566   SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
14567   SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
14568   SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
14569   return DAG.getBitcast(MVT::v4i32, ShufPS);
14570 }
14571 
14572 /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
14573 /// shuffle lowering, and the most complex part.
14574 ///
14575 /// The lowering strategy is to try to form pairs of input lanes which are
14576 /// targeted at the same half of the final vector, and then use a dword shuffle
14577 /// to place them onto the right half, and finally unpack the paired lanes into
14578 /// their final position.
14579 ///
14580 /// The exact breakdown of how to form these dword pairs and align them on the
14581 /// correct sides is really tricky. See the comments within the function for
14582 /// more of the details.
14583 ///
14584 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
14585 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
14586 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
14587 /// vector, form the analogous 128-bit 8-element Mask.
14588 static SDValue lowerV8I16GeneralSingleInputShuffle(
14589     const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
14590     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14591   assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
14592   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
14593 
14594   assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
14595   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
14596   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
14597 
14598   // Attempt to directly match PSHUFLW or PSHUFHW.
14599   if (isUndefOrInRange(LoMask, 0, 4) &&
14600       isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
14601     return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14602                        getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14603   }
14604   if (isUndefOrInRange(HiMask, 4, 8) &&
14605       isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
14606     for (int i = 0; i != 4; ++i)
14607       HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
14608     return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14609                        getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14610   }
14611 
14612   SmallVector<int, 4> LoInputs;
14613   copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
14614   array_pod_sort(LoInputs.begin(), LoInputs.end());
14615   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
14616   SmallVector<int, 4> HiInputs;
14617   copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
14618   array_pod_sort(HiInputs.begin(), HiInputs.end());
14619   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
14620   int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
14621   int NumHToL = LoInputs.size() - NumLToL;
14622   int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
14623   int NumHToH = HiInputs.size() - NumLToH;
14624   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
14625   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
14626   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
14627   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
14628 
14629   // If we are shuffling values from one half, check how many different DWORD
14630   // pairs we need to create. If only 1 or 2 then we can perform this as a
14631   // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
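  // For example, the single-half mask {2, 3, 0, 1, 0, 1, 2, 3} needs only the
  // dword pairs (2,3) and (0,1), so PSHUFLW {2,3,0,1} followed by
  // PSHUFD {0,1,1,0} suffices.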
14632   auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
14633                                ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
14634     V = DAG.getNode(ShufWOp, DL, VT, V,
14635                     getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14636     V = DAG.getBitcast(PSHUFDVT, V);
14637     V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
14638                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14639     return DAG.getBitcast(VT, V);
14640   };
14641 
14642   if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
14643     int PSHUFDMask[4] = { -1, -1, -1, -1 };
14644     SmallVector<std::pair<int, int>, 4> DWordPairs;
14645     int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
14646 
14647     // Collect the different DWORD pairs.
14648     for (int DWord = 0; DWord != 4; ++DWord) {
14649       int M0 = Mask[2 * DWord + 0];
14650       int M1 = Mask[2 * DWord + 1];
14651       M0 = (M0 >= 0 ? M0 % 4 : M0);
14652       M1 = (M1 >= 0 ? M1 % 4 : M1);
14653       if (M0 < 0 && M1 < 0)
14654         continue;
14655 
14656       bool Match = false;
14657       for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
14658         auto &DWordPair = DWordPairs[j];
14659         if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
14660             (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
14661           DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
14662           DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
14663           PSHUFDMask[DWord] = DOffset + j;
14664           Match = true;
14665           break;
14666         }
14667       }
14668       if (!Match) {
14669         PSHUFDMask[DWord] = DOffset + DWordPairs.size();
14670         DWordPairs.push_back(std::make_pair(M0, M1));
14671       }
14672     }
14673 
14674     if (DWordPairs.size() <= 2) {
14675       DWordPairs.resize(2, std::make_pair(-1, -1));
14676       int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
14677                               DWordPairs[1].first, DWordPairs[1].second};
14678       if ((NumHToL + NumHToH) == 0)
14679         return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
14680       if ((NumLToL + NumLToH) == 0)
14681         return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
14682     }
14683   }
14684 
14685   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
14686   // such inputs we can swap two of the dwords across the half mark and end up
14687   // with <=2 inputs to each half in each half. Once there, we can fall through
14688   // to the generic code below. For example:
14689   //
14690   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14691   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
14692   //
14693   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
14694   // and an existing 2-into-2 on the other half. In this case we may have to
14695   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
14696   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14697   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14698   // because any other situation (including a 3-into-1 or 1-into-3 in the other
14699   // half than the one we target for fixing) will be fixed when we re-enter this
14700   // path. We will also combine any resulting sequence of PSHUFD instructions
14701   // into a single instruction. Here is an example of the tricky case:
14702   //
14703   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14704   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14705   //
14706   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14707   //
14708   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14709   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14710   //
14711   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14712   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14713   //
14714   // The result is fine to be handled by the generic logic.
14715   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14716                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14717                           int AOffset, int BOffset) {
14718     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14719            "Must call this with A having 3 or 1 inputs from the A half.");
14720     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14721            "Must call this with B having 1 or 3 inputs from the B half.");
14722     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14723            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14724 
14725     bool ThreeAInputs = AToAInputs.size() == 3;
14726 
14727     // Compute the index of dword with only one word among the three inputs in
14728     // a half by taking the sum of the half with three inputs and subtracting
14729     // the sum of the actual three inputs. The difference is the remaining
14730     // slot.
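    // For example, if the three low-half inputs are {0, 1, 3}, the sum trick
    // gives 6 - 4 = 2 as the unused word index, so the triple dword is
    // 2 / 2 = 1.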
14731     int ADWord = 0, BDWord = 0;
14732     int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14733     int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14734     int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14735     ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14736     int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14737     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
    int TripleNonInputIdx =
        TripleInputSum -
        std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14740     TripleDWord = TripleNonInputIdx / 2;
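    // For example, if the three inputs in the low half (offset 0) are words
    // {0, 1, 3}, the full sum is 0+1+2+3 = 6, so the free slot is word 2 and
    // the dword needing repair is dword 1.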
14741 
14742     // We use xor with one to compute the adjacent DWord to whichever one the
14743     // OneInput is in.
14744     OneInputDWord = (OneInput / 2) ^ 1;
14745 
14746     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14747     // and BToA inputs. If there is also such a problem with the BToB and AToB
    // inputs, we don't necessarily try to fix it here -- we'll recurse and see
    // it in the next pass. However, if we have a 2<-2 in the BToB and AToB
    // inputs, it is essential that we don't *create* a 3<-1 as then we might
    // oscillate.
14751     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
      // Compute how many inputs will be flipped by swapping these DWords. We
      // need to balance this to ensure we don't form a 3-1 shuffle in the
      // other half.
14756       int NumFlippedAToBInputs =
14757           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
14758           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
14759       int NumFlippedBToBInputs =
14760           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
14761           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
14762       if ((NumFlippedAToBInputs == 1 &&
14763            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14764           (NumFlippedBToBInputs == 1 &&
14765            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14766         // We choose whether to fix the A half or B half based on whether that
14767         // half has zero flipped inputs. At zero, we may not be able to fix it
14768         // with that half. We also bias towards fixing the B half because that
14769         // will more commonly be the high half, and we have to bias one way.
14770         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14771                                                        ArrayRef<int> Inputs) {
14772           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14773           bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14774           // Determine whether the free index is in the flipped dword or the
14775           // unflipped dword based on where the pinned index is. We use this bit
14776           // in an xor to conditionally select the adjacent dword.
14777           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14778           bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14779           if (IsFixIdxInput == IsFixFreeIdxInput)
14780             FixFreeIdx += 1;
14781           IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14782           assert(IsFixIdxInput != IsFixFreeIdxInput &&
14783                  "We need to be changing the number of flipped inputs!");
14784           int PSHUFHalfMask[] = {0, 1, 2, 3};
14785           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
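          // Word indices 0-3 live in the low half and are repaired with
          // PSHUFLW; indices 4-7 live in the high half and use PSHUFHW.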
14786           V = DAG.getNode(
14787               FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14788               MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14789               getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14790 
14791           for (int &M : Mask)
14792             if (M >= 0 && M == FixIdx)
14793               M = FixFreeIdx;
14794             else if (M >= 0 && M == FixFreeIdx)
14795               M = FixIdx;
14796         };
14797         if (NumFlippedBToBInputs != 0) {
14798           int BPinnedIdx =
14799               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14800           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14801         } else {
14802           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14803           int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14804           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14805         }
14806       }
14807     }
14808 
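    // Swap the selected A-half and B-half dwords with a single PSHUFD, turning
    // the 3:1 imbalance into a 2:2 split the generic code below can handle.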
14809     int PSHUFDMask[] = {0, 1, 2, 3};
14810     PSHUFDMask[ADWord] = BDWord;
14811     PSHUFDMask[BDWord] = ADWord;
14812     V = DAG.getBitcast(
14813         VT,
14814         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14815                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14816 
14817     // Adjust the mask to match the new locations of A and B.
14818     for (int &M : Mask)
14819       if (M >= 0 && M/2 == ADWord)
14820         M = 2 * BDWord + M % 2;
14821       else if (M >= 0 && M/2 == BDWord)
14822         M = 2 * ADWord + M % 2;
14823 
14824     // Recurse back into this routine to re-compute state now that this isn't
14825     // a 3 and 1 problem.
14826     return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14827   };
14828   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14829     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14830   if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14831     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14832 
14833   // At this point there are at most two inputs to the low and high halves from
14834   // each half. That means the inputs can always be grouped into dwords and
14835   // those dwords can then be moved to the correct half with a dword shuffle.
14836   // We use at most one low and one high word shuffle to collect these paired
14837   // inputs into dwords, and finally a dword shuffle to place them.
14838   int PSHUFLMask[4] = {-1, -1, -1, -1};
14839   int PSHUFHMask[4] = {-1, -1, -1, -1};
14840   int PSHUFDMask[4] = {-1, -1, -1, -1};
14841 
14842   // First fix the masks for all the inputs that are staying in their
14843   // original halves. This will then dictate the targets of the cross-half
14844   // shuffles.
14845   auto fixInPlaceInputs =
14846       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14847                     MutableArrayRef<int> SourceHalfMask,
14848                     MutableArrayRef<int> HalfMask, int HalfOffset) {
14849     if (InPlaceInputs.empty())
14850       return;
14851     if (InPlaceInputs.size() == 1) {
14852       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14853           InPlaceInputs[0] - HalfOffset;
14854       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14855       return;
14856     }
14857     if (IncomingInputs.empty()) {
14858       // Just fix all of the in place inputs.
14859       for (int Input : InPlaceInputs) {
14860         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14861         PSHUFDMask[Input / 2] = Input / 2;
14862       }
14863       return;
14864     }
14865 
14866     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14867     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14868         InPlaceInputs[0] - HalfOffset;
14869     // Put the second input next to the first so that they are packed into
14870     // a dword. We find the adjacent index by toggling the low bit.
14871     int AdjIndex = InPlaceInputs[0] ^ 1;
14872     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14873     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
14874     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14875   };
14876   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14877   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14878 
14879   // Now gather the cross-half inputs and place them into a free dword of
14880   // their target half.
14881   // FIXME: This operation could almost certainly be simplified dramatically to
14882   // look more like the 3-1 fixing operation.
14883   auto moveInputsToRightHalf = [&PSHUFDMask](
14884       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14885       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14886       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14887       int DestOffset) {
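    // A word slot is "clobbered" when the source half shuffle already routes a
    // different word into it; a dword is clobbered if either of its words is.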
14888     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14889       return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14890     };
14891     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14892                                                int Word) {
14893       int LowWord = Word & ~1;
14894       int HighWord = Word | 1;
14895       return isWordClobbered(SourceHalfMask, LowWord) ||
14896              isWordClobbered(SourceHalfMask, HighWord);
14897     };
14898 
14899     if (IncomingInputs.empty())
14900       return;
14901 
14902     if (ExistingInputs.empty()) {
14903       // Map any dwords with inputs from them into the right half.
14904       for (int Input : IncomingInputs) {
14905         // If the source half mask maps over the inputs, turn those into
14906         // swaps and use the swapped lane.
14907         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14908           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14909             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14910                 Input - SourceOffset;
14911             // We have to swap the uses in our half mask in one sweep.
14912             for (int &M : HalfMask)
14913               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14914                 M = Input;
14915               else if (M == Input)
14916                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14917           } else {
14918             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14919                        Input - SourceOffset &&
14920                    "Previous placement doesn't match!");
14921           }
14922           // Note that this correctly re-maps both when we do a swap and when
14923           // we observe the other side of the swap above. We rely on that to
14924           // avoid swapping the members of the input list directly.
14925           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14926         }
14927 
14928         // Map the input's dword into the correct half.
14929         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14930           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14931         else
14932           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14933                      Input / 2 &&
14934                  "Previous placement doesn't match!");
14935       }
14936 
14937       // And just directly shift any other-half mask elements to be same-half
14938       // as we will have mirrored the dword containing the element into the
14939       // same position within that half.
14940       for (int &M : HalfMask)
14941         if (M >= SourceOffset && M < SourceOffset + 4) {
14942           M = M - SourceOffset + DestOffset;
14943           assert(M >= 0 && "This should never wrap below zero!");
14944         }
14945       return;
14946     }
14947 
14948     // Ensure we have the input in a viable dword of its current half. This
14949     // is particularly tricky because the original position may be clobbered
14950     // by inputs being moved and *staying* in that half.
14951     if (IncomingInputs.size() == 1) {
14952       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14953         int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14954                          SourceOffset;
14955         SourceHalfMask[InputFixed - SourceOffset] =
14956             IncomingInputs[0] - SourceOffset;
14957         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
14958                      InputFixed);
14959         IncomingInputs[0] = InputFixed;
14960       }
14961     } else if (IncomingInputs.size() == 2) {
14962       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14963           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14964         // We have two non-adjacent or clobbered inputs we need to extract from
14965         // the source half. To do this, we need to map them into some adjacent
14966         // dword slot in the source mask.
14967         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14968                               IncomingInputs[1] - SourceOffset};
14969 
14970         // If there is a free slot in the source half mask adjacent to one of
14971         // the inputs, place the other input in it. We use (Index XOR 1) to
14972         // compute an adjacent index.
14973         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14974             SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14975           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14976           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14977           InputsFixed[1] = InputsFixed[0] ^ 1;
14978         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14979                    SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14980           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14981           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14982           InputsFixed[0] = InputsFixed[1] ^ 1;
14983         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14984                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14985           // The two inputs are in the same DWord but it is clobbered and the
14986           // adjacent DWord isn't used at all. Move both inputs to the free
14987           // slot.
14988           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14989           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14990           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14991           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14992         } else {
14993           // The only way we hit this point is if there is no clobbering
14994           // (because there are no off-half inputs to this half) and there is no
14995           // free slot adjacent to one of the inputs. In this case, we have to
14996           // swap an input with a non-input.
14997           for (int i = 0; i < 4; ++i)
14998             assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14999                    "We can't handle any clobbers here!");
15000           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
15001                  "Cannot have adjacent inputs here!");
15002 
15003           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15004           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
15005 
15006           // We also have to update the final source mask in this case because
15007           // it may need to undo the above swap.
15008           for (int &M : FinalSourceHalfMask)
15009             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
15010               M = InputsFixed[1] + SourceOffset;
15011             else if (M == InputsFixed[1] + SourceOffset)
15012               M = (InputsFixed[0] ^ 1) + SourceOffset;
15013 
15014           InputsFixed[1] = InputsFixed[0] ^ 1;
15015         }
15016 
15017         // Point everything at the fixed inputs.
15018         for (int &M : HalfMask)
15019           if (M == IncomingInputs[0])
15020             M = InputsFixed[0] + SourceOffset;
15021           else if (M == IncomingInputs[1])
15022             M = InputsFixed[1] + SourceOffset;
15023 
15024         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
15025         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
15026       }
15027     } else {
15028       llvm_unreachable("Unhandled input size!");
15029     }
15030 
15031     // Now hoist the DWord down to the right half.
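    // The incoming inputs now share a single dword, so place it in the first
    // dword slot of the destination half if that is still free, otherwise in
    // the second (which must then be free).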
15032     int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
15033     assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
15034     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
15035     for (int &M : HalfMask)
15036       for (int Input : IncomingInputs)
15037         if (M == Input)
15038           M = FreeDWord * 2 + Input % 2;
15039   };
15040   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
15041                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
15042   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
15043                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
15044 
15045   // Now enact all the shuffles we've computed to move the inputs into their
15046   // target half.
15047   if (!isNoopShuffleMask(PSHUFLMask))
15048     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15049                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
15050   if (!isNoopShuffleMask(PSHUFHMask))
15051     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15052                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
15053   if (!isNoopShuffleMask(PSHUFDMask))
15054     V = DAG.getBitcast(
15055         VT,
15056         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15057                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15058 
15059   // At this point, each half should contain all its inputs, and we can then
15060   // just shuffle them into their final position.
15061   assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
15062          "Failed to lift all the high half inputs to the low mask!");
15063   assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
15064          "Failed to lift all the low half inputs to the high mask!");
15065 
15066   // Do a half shuffle for the low mask.
15067   if (!isNoopShuffleMask(LoMask))
15068     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15069                     getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15070 
15071   // Do a half shuffle with the high mask after shifting its values down.
15072   for (int &M : HiMask)
15073     if (M >= 0)
15074       M -= 4;
15075   if (!isNoopShuffleMask(HiMask))
15076     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15077                     getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15078 
15079   return V;
15080 }
15081 
15082 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
15083 /// blend if only one input is used.
15084 static SDValue lowerShuffleAsBlendOfPSHUFBs(
15085     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15086     const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
15087   assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
15088          "Lane crossing shuffle masks not supported");
15089 
15090   int NumBytes = VT.getSizeInBits() / 8;
15091   int Size = Mask.size();
15092   int Scale = NumBytes / Size;
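  // Scale is the number of bytes covered by each mask element, e.g. 2 when
  // building a byte mask for a v8i16 shuffle.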
15093 
15094   SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15095   SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15096   V1InUse = false;
15097   V2InUse = false;
15098 
15099   for (int i = 0; i < NumBytes; ++i) {
15100     int M = Mask[i / Scale];
15101     if (M < 0)
15102       continue;
15103 
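    // PSHUFB writes zero to a destination byte when bit 7 of its control byte
    // is set, so 0x80 serves as the "select zero" index.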
15104     const int ZeroMask = 0x80;
15105     int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
15106     int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
15107     if (Zeroable[i / Scale])
15108       V1Idx = V2Idx = ZeroMask;
15109 
15110     V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
15111     V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
15112     V1InUse |= (ZeroMask != V1Idx);
15113     V2InUse |= (ZeroMask != V2Idx);
15114   }
15115 
15116   MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
15117   if (V1InUse)
15118     V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
15119                      DAG.getBuildVector(ShufVT, DL, V1Mask));
15120   if (V2InUse)
15121     V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
15122                      DAG.getBuildVector(ShufVT, DL, V2Mask));
15123 
15124   // If we need shuffled inputs from both, blend the two.
15125   SDValue V;
15126   if (V1InUse && V2InUse)
15127     V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
15128   else
15129     V = V1InUse ? V1 : V2;
15130 
15131   // Cast the result back to the correct type.
15132   return DAG.getBitcast(VT, V);
15133 }
15134 
15135 /// Generic lowering of 8-lane i16 shuffles.
15136 ///
15137 /// This handles both single-input shuffles and combined shuffle/blends with
15138 /// two inputs. The single input shuffles are immediately delegated to
15139 /// a dedicated lowering routine.
15140 ///
15141 /// The blends are lowered in one of three fundamental ways. If there are few
15142 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
15143 /// of the input is significantly cheaper when lowered as an interleaving of
15144 /// the two inputs, try to interleave them. Otherwise, blend the low and high
15145 /// halves of the inputs separately (making them have relatively few inputs)
15146 /// and then concatenate them.
15147 static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15148                                  const APInt &Zeroable, SDValue V1, SDValue V2,
15149                                  const X86Subtarget &Subtarget,
15150                                  SelectionDAG &DAG) {
15151   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15152   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15153   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15154 
15155   // Whenever we can lower this as a zext, that instruction is strictly faster
15156   // than any alternative.
15157   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
15158                                                    Zeroable, Subtarget, DAG))
15159     return ZExt;
15160 
  // Try to lower using a truncation.
15162   if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15163                                         Subtarget, DAG))
15164     return V;
15165 
15166   int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
15167 
15168   if (NumV2Inputs == 0) {
15169     // Try to use shift instructions.
15170     if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
15171                                             Zeroable, Subtarget, DAG))
15172       return Shift;
15173 
15174     // Check for being able to broadcast a single element.
15175     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
15176                                                     Mask, Subtarget, DAG))
15177       return Broadcast;
15178 
15179     // Try to use bit rotation instructions.
15180     if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
15181                                                  Subtarget, DAG))
15182       return Rotate;
15183 
15184     // Use dedicated unpack instructions for masks that match their pattern.
15185     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15186       return V;
15187 
15188     // Use dedicated pack instructions for masks that match their pattern.
15189     if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15190                                          Subtarget))
15191       return V;
15192 
15193     // Try to use byte rotation instructions.
15194     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
15195                                                   Subtarget, DAG))
15196       return Rotate;
15197 
15198     // Make a copy of the mask so it can be modified.
15199     SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
15200     return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
15201                                                Subtarget, DAG);
15202   }
15203 
15204   assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
15205          "All single-input shuffles should be canonicalized to be V1-input "
15206          "shuffles.");
15207 
15208   // Try to use shift instructions.
15209   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
15210                                           Zeroable, Subtarget, DAG))
15211     return Shift;
15212 
15213   // See if we can use SSE4A Extraction / Insertion.
15214   if (Subtarget.hasSSE4A())
15215     if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
15216                                           Zeroable, DAG))
15217       return V;
15218 
15219   // There are special ways we can lower some single-element blends.
15220   if (NumV2Inputs == 1)
15221     if (SDValue V = lowerShuffleAsElementInsertion(
15222             DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15223       return V;
15224 
15225   // We have different paths for blend lowering, but they all must use the
15226   // *exact* same predicate.
15227   bool IsBlendSupported = Subtarget.hasSSE41();
15228   if (IsBlendSupported)
15229     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15230                                             Zeroable, Subtarget, DAG))
15231       return Blend;
15232 
15233   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15234                                              Zeroable, Subtarget, DAG))
15235     return Masked;
15236 
15237   // Use dedicated unpack instructions for masks that match their pattern.
15238   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15239     return V;
15240 
15241   // Use dedicated pack instructions for masks that match their pattern.
15242   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15243                                        Subtarget))
15244     return V;
15245 
  // Try to lower using a truncation.
15247   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15248                                        Subtarget, DAG))
15249     return V;
15250 
15251   // Try to use byte rotation instructions.
15252   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15253                                                 Subtarget, DAG))
15254     return Rotate;
15255 
15256   if (SDValue BitBlend =
15257           lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15258     return BitBlend;
15259 
15260   // Try to use byte shift instructions to mask.
15261   if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15262                                               Zeroable, Subtarget, DAG))
15263     return V;
15264 
  // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
15266   // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
15267   // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
15268   int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
15269   if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
15270       !Subtarget.hasVLX()) {
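    // Zero the words being dropped: keep only the low 16 bits of every
    // (1 << (NumEvenDrops - 1))-th dword so the PACKUS below passes the
    // surviving values through unchanged.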
15271     SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
15272     for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15273       DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15274     SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
15275     V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15276                      DWordClearMask);
15277     V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15278                      DWordClearMask);
15279     // Now pack things back together.
15280     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
15281     if (NumEvenDrops == 2) {
15282       Result = DAG.getBitcast(MVT::v4i32, Result);
15283       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
15284     }
15285     return Result;
15286   }
15287 
15288   // Try to lower by permuting the inputs into an unpack instruction.
15289   if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15290                                                       Mask, Subtarget, DAG))
15291     return Unpack;
15292 
15293   // If we can't directly blend but can use PSHUFB, that will be better as it
15294   // can both shuffle and set up the inefficient blend.
15295   if (!IsBlendSupported && Subtarget.hasSSSE3()) {
15296     bool V1InUse, V2InUse;
15297     return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15298                                         Zeroable, DAG, V1InUse, V2InUse);
15299   }
15300 
15301   // We can always bit-blend if we have to so the fallback strategy is to
15302   // decompose into single-input permutes and blends/unpacks.
15303   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
15304                                               Mask, Subtarget, DAG);
15305 }
15306 
// Lower a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
// sub-512-bit shuffles are padded to 512 bits for the shuffle and then
15309 // the active subvector is extracted.
15310 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
15311                                      ArrayRef<int> Mask, SDValue V1, SDValue V2,
15312                                      const X86Subtarget &Subtarget,
15313                                      SelectionDAG &DAG) {
15314   MVT MaskVT = VT.changeTypeToInteger();
15315   SDValue MaskNode;
15316   MVT ShuffleVT = VT;
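  // The variable permute takes its indices as an integer vector with the same
  // element count and width as the data, hence the integer twin of VT above.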
15317   if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
15318     V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
15319     V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
15320     ShuffleVT = V1.getSimpleValueType();
15321 
15322     // Adjust mask to correct indices for the second input.
15323     int NumElts = VT.getVectorNumElements();
15324     unsigned Scale = 512 / VT.getSizeInBits();
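    // After widening, V2's elements begin at Scale * NumElts rather than
    // NumElts, so shift any mask index that referred to V2 up accordingly.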
15325     SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
15326     for (int &M : AdjustedMask)
15327       if (NumElts <= M)
15328         M += (Scale - 1) * NumElts;
15329     MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
15330     MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
15331   } else {
15332     MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
15333   }
15334 
15335   SDValue Result;
15336   if (V2.isUndef())
15337     Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
15338   else
15339     Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
15340 
15341   if (VT != ShuffleVT)
15342     Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
15343 
15344   return Result;
15345 }
15346 
15347 /// Generic lowering of v16i8 shuffles.
15348 ///
15349 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity-reducing interleaving. If that doesn't help, it uses
15351 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15352 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15353 /// back together.
15354 static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15355                                  const APInt &Zeroable, SDValue V1, SDValue V2,
15356                                  const X86Subtarget &Subtarget,
15357                                  SelectionDAG &DAG) {
15358   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15359   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15360   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15361 
15362   // Try to use shift instructions.
15363   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
15364                                           Zeroable, Subtarget, DAG))
15365     return Shift;
15366 
15367   // Try to use byte rotation instructions.
15368   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
15369                                                 Subtarget, DAG))
15370     return Rotate;
15371 
15372   // Use dedicated pack instructions for masks that match their pattern.
15373   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
15374                                        Subtarget))
15375     return V;
15376 
15377   // Try to use a zext lowering.
15378   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
15379                                                    Zeroable, Subtarget, DAG))
15380     return ZExt;
15381 
  // Try to lower using a truncation.
15383   if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15384                                         Subtarget, DAG))
15385     return V;
15386 
15387   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15388                                        Subtarget, DAG))
15389     return V;
15390 
15391   // See if we can use SSE4A Extraction / Insertion.
15392   if (Subtarget.hasSSE4A())
15393     if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
15394                                           Zeroable, DAG))
15395       return V;
15396 
15397   int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
15398 
15399   // For single-input shuffles, there are some nicer lowering tricks we can use.
15400   if (NumV2Elements == 0) {
15401     // Check for being able to broadcast a single element.
15402     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
15403                                                     Mask, Subtarget, DAG))
15404       return Broadcast;
15405 
15406     // Try to use bit rotation instructions.
15407     if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
15408                                                  Subtarget, DAG))
15409       return Rotate;
15410 
15411     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15412       return V;
15413 
15414     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
15415     // Notably, this handles splat and partial-splat shuffles more efficiently.
15416     // However, it only makes sense if the pre-duplication shuffle simplifies
15417     // things significantly. Currently, this means we need to be able to
15418     // express the pre-duplication shuffle as an i16 shuffle.
15419     //
15420     // FIXME: We should check for other patterns which can be widened into an
15421     // i16 shuffle as well.
15422     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
15423       for (int i = 0; i < 16; i += 2)
15424         if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15425           return false;
15426 
15427       return true;
15428     };
15429     auto tryToWidenViaDuplication = [&]() -> SDValue {
15430       if (!canWidenViaDuplication(Mask))
15431         return SDValue();
15432       SmallVector<int, 4> LoInputs;
15433       copy_if(Mask, std::back_inserter(LoInputs),
15434               [](int M) { return M >= 0 && M < 8; });
15435       array_pod_sort(LoInputs.begin(), LoInputs.end());
15436       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
15437                      LoInputs.end());
15438       SmallVector<int, 4> HiInputs;
15439       copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
15440       array_pod_sort(HiInputs.begin(), HiInputs.end());
15441       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
15442                      HiInputs.end());
15443 
15444       bool TargetLo = LoInputs.size() >= HiInputs.size();
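      // Duplicate into whichever half already supplies the majority of the
      // used bytes so that as few inputs as possible have to move.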
15445       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
15446       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
15447 
15448       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
15449       SmallDenseMap<int, int, 8> LaneMap;
15450       for (int I : InPlaceInputs) {
15451         PreDupI16Shuffle[I/2] = I/2;
15452         LaneMap[I] = I;
15453       }
15454       int j = TargetLo ? 0 : 4, je = j + 4;
15455       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
15456         // Check if j is already a shuffle of this input. This happens when
15457         // there are two adjacent bytes after we move the low one.
15458         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
15459           // If we haven't yet mapped the input, search for a slot into which
15460           // we can map it.
15461           while (j < je && PreDupI16Shuffle[j] >= 0)
15462             ++j;
15463 
15464           if (j == je)
            // We can't place the inputs into a single half with a simple i16
            // shuffle, so bail.
15466             return SDValue();
15467 
15468           // Map this input with the i16 shuffle.
15469           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
15470         }
15471 
15472         // Update the lane map based on the mapping we ended up with.
15473         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
15474       }
15475       V1 = DAG.getBitcast(
15476           MVT::v16i8,
15477           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15478                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
15479 
15480       // Unpack the bytes to form the i16s that will be shuffled into place.
15481       bool EvenInUse = false, OddInUse = false;
15482       for (int i = 0; i < 16; i += 2) {
15483         EvenInUse |= (Mask[i + 0] >= 0);
15484         OddInUse |= (Mask[i + 1] >= 0);
15485         if (EvenInUse && OddInUse)
15486           break;
15487       }
15488       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
15489                        MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
15490                        OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
15491 
15492       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
15493       for (int i = 0; i < 16; ++i)
15494         if (Mask[i] >= 0) {
15495           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
15496           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
15497           if (PostDupI16Shuffle[i / 2] < 0)
15498             PostDupI16Shuffle[i / 2] = MappedMask;
15499           else
15500             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
15501                    "Conflicting entries in the original shuffle!");
15502         }
15503       return DAG.getBitcast(
15504           MVT::v16i8,
15505           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15506                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
15507     };
15508     if (SDValue V = tryToWidenViaDuplication())
15509       return V;
15510   }
15511 
15512   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
15513                                              Zeroable, Subtarget, DAG))
15514     return Masked;
15515 
15516   // Use dedicated unpack instructions for masks that match their pattern.
15517   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15518     return V;
15519 
15520   // Try to use byte shift instructions to mask.
15521   if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
15522                                               Zeroable, Subtarget, DAG))
15523     return V;
15524 
15525   // Check for compaction patterns.
15526   bool IsSingleInput = V2.isUndef();
15527   int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
15528 
15529   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
15530   // with PSHUFB. It is important to do this before we attempt to generate any
15531   // blends but after all of the single-input lowerings. If the single input
15532   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
15533   // want to preserve that and we can DAG combine any longer sequences into
15534   // a PSHUFB in the end. But once we start blending from multiple inputs,
15535   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
15536   // and there are *very* few patterns that would actually be faster than the
15537   // PSHUFB approach because of its ability to zero lanes.
15538   //
15539   // If the mask is a binary compaction, we can more efficiently perform this
15540   // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
15541   //
15542   // FIXME: The only exceptions to the above are blends which are exact
15543   // interleavings with direct instructions supporting them. We currently don't
15544   // handle those well here.
15545   if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
15546     bool V1InUse = false;
15547     bool V2InUse = false;
15548 
15549     SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
15550         DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
15551 
15552     // If both V1 and V2 are in use and we can use a direct blend or an unpack,
15553     // do so. This avoids using them to handle blends-with-zero which is
15554     // important as a single pshufb is significantly faster for that.
15555     if (V1InUse && V2InUse) {
15556       if (Subtarget.hasSSE41())
15557         if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
15558                                                 Zeroable, Subtarget, DAG))
15559           return Blend;
15560 
15561       // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very slightly) more efficient, we
      // prefer this lowering because there are common cases where part of
15564       // the complexity of the shuffles goes away when we do the final blend as
15565       // an unpack.
15566       // FIXME: It might be worth trying to detect if the unpack-feeding
15567       // shuffles will both be pshufb, in which case we shouldn't bother with
15568       // this.
15569       if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
15570               DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15571         return Unpack;
15572 
15573       // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
15574       if (Subtarget.hasVBMI())
15575         return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
15576                                      DAG);
15577 
15578       // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
15579       if (Subtarget.hasXOP()) {
15580         SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
15581         return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
15582       }
15583 
15584       // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
15585       // PALIGNR will be cheaper than the second PSHUFB+OR.
15586       if (SDValue V = lowerShuffleAsByteRotateAndPermute(
15587               DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15588         return V;
15589     }
15590 
15591     return PSHUFB;
15592   }
15593 
15594   // There are special ways we can lower some single-element blends.
15595   if (NumV2Elements == 1)
15596     if (SDValue V = lowerShuffleAsElementInsertion(
15597             DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15598       return V;
15599 
15600   if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
15601     return Blend;
15602 
15603   // Check whether a compaction lowering can be done. This handles shuffles
15604   // which take every Nth element for some even N. See the helper function for
15605   // details.
15606   //
15607   // We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
15609   // rearranging bytes to truncate wide elements.
15610   if (NumEvenDrops) {
    // NumEvenDrops is the log2 of the stride between the elements we keep.
    // Another way of thinking about it is that we need to drop the even
    // elements this many times to get the original input.
15614 
15615     // First we need to zero all the dropped bytes.
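    // Keep only the low byte of every (1 << (NumEvenDrops - 1))-th word; all
    // other bytes are cleared so the saturating PACKUS below simply passes
    // the surviving bytes through.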
15616     assert(NumEvenDrops <= 3 &&
15617            "No support for dropping even elements more than 3 times.");
15618     SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
15619     for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15620       WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15621     SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15622     V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15623                      WordClearMask);
15624     if (!IsSingleInput)
15625       V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15626                        WordClearMask);
15627 
15628     // Now pack things back together.
15629     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15630                                  IsSingleInput ? V1 : V2);
15631     for (int i = 1; i < NumEvenDrops; ++i) {
15632       Result = DAG.getBitcast(MVT::v8i16, Result);
15633       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15634     }
15635     return Result;
15636   }
15637 
15638   // Handle multi-input cases by blending/unpacking single-input shuffles.
15639   if (NumV2Elements > 0)
15640     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15641                                                 Subtarget, DAG);
15642 
15643   // The fallback path for single-input shuffles widens this into two v8i16
15644   // vectors with unpacks, shuffles those, and then pulls them back together
15645   // with a pack.
15646   SDValue V = V1;
15647 
15648   std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15649   std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15650   for (int i = 0; i < 16; ++i)
15651     if (Mask[i] >= 0)
15652       (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15653 
15654   SDValue VLoHalf, VHiHalf;
15655   // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15656   // them out and avoid using UNPCK{L,H} to extract the elements of V as
15657   // i16s.
15658   if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15659       none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15660     // Use a mask to drop the high bytes.
15661     VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15662     VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15663                           DAG.getConstant(0x00FF, DL, MVT::v8i16));
15664 
15665     // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15666     VHiHalf = DAG.getUNDEF(MVT::v8i16);
15667 
15668     // Squash the masks to point directly into VLoHalf.
15669     for (int &M : LoBlendMask)
15670       if (M >= 0)
15671         M /= 2;
15672     for (int &M : HiBlendMask)
15673       if (M >= 0)
15674         M /= 2;
15675   } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half
    // into VHiHalf so that we can blend them as i16s.
15678     SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15679 
15680     VLoHalf = DAG.getBitcast(
15681         MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15682     VHiHalf = DAG.getBitcast(
15683         MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15684   }
15685 
  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
                                     LoBlendMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
                                     HiBlendMask);
15688 
15689   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15690 }
15691 
15692 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
15693 ///
15694 /// This routine breaks down the specific type of 128-bit shuffle and
15695 /// dispatches to the lowering routines accordingly.
15696 static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15697                                   MVT VT, SDValue V1, SDValue V2,
15698                                   const APInt &Zeroable,
15699                                   const X86Subtarget &Subtarget,
15700                                   SelectionDAG &DAG) {
15701   switch (VT.SimpleTy) {
15702   case MVT::v2i64:
15703     return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15704   case MVT::v2f64:
15705     return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15706   case MVT::v4i32:
15707     return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15708   case MVT::v4f32:
15709     return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15710   case MVT::v8i16:
15711     return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15712   case MVT::v16i8:
15713     return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15714 
15715   default:
15716     llvm_unreachable("Unimplemented!");
15717   }
15718 }
15719 
15720 /// Generic routine to split vector shuffle into half-sized shuffles.
15721 ///
15722 /// This routine just extracts two subvectors, shuffles them independently, and
15723 /// then concatenates them back together. This should work effectively with all
15724 /// AVX vector shuffle types.
15725 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15726                                     SDValue V2, ArrayRef<int> Mask,
15727                                     SelectionDAG &DAG) {
15728   assert(VT.getSizeInBits() >= 256 &&
15729          "Only for 256-bit or wider vector shuffles!");
15730   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15731   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15732 
15733   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15734   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15735 
15736   int NumElements = VT.getVectorNumElements();
15737   int SplitNumElements = NumElements / 2;
15738   MVT ScalarVT = VT.getVectorElementType();
15739   MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15740 
15741   // Use splitVector/extractSubVector so that split build-vectors just build two
15742   // narrower build vectors. This helps shuffling with splats and zeros.
15743   auto SplitVector = [&](SDValue V) {
15744     SDValue LoV, HiV;
15745     std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15746     return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15747                           DAG.getBitcast(SplitVT, HiV));
15748   };
15749 
15750   SDValue LoV1, HiV1, LoV2, HiV2;
15751   std::tie(LoV1, HiV1) = SplitVector(V1);
15752   std::tie(LoV2, HiV2) = SplitVector(V2);
15753 
15754   // Now create two 4-way blends of these half-width vectors.
15755   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15756     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
15757     SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15758     SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15759     SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15760     for (int i = 0; i < SplitNumElements; ++i) {
15761       int M = HalfMask[i];
15762       if (M >= NumElements) {
15763         if (M >= NumElements + SplitNumElements)
15764           UseHiV2 = true;
15765         else
15766           UseLoV2 = true;
15767         V2BlendMask[i] = M - NumElements;
15768         BlendMask[i] = SplitNumElements + i;
15769       } else if (M >= 0) {
15770         if (M >= SplitNumElements)
15771           UseHiV1 = true;
15772         else
15773           UseLoV1 = true;
15774         V1BlendMask[i] = M;
15775         BlendMask[i] = i;
15776       }
15777     }
15778 
15779     // Because the lowering happens after all combining takes place, we need to
15780     // manually combine these blend masks as much as possible so that we create
15781     // a minimal number of high-level vector shuffle nodes.
15782 
15783     // First try just blending the halves of V1 or V2.
15784     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15785       return DAG.getUNDEF(SplitVT);
15786     if (!UseLoV2 && !UseHiV2)
15787       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15788     if (!UseLoV1 && !UseHiV1)
15789       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15790 
15791     SDValue V1Blend, V2Blend;
15792     if (UseLoV1 && UseHiV1) {
15793       V1Blend =
15794         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15795     } else {
15796       // We only use half of V1 so map the usage down into the final blend mask.
15797       V1Blend = UseLoV1 ? LoV1 : HiV1;
15798       for (int i = 0; i < SplitNumElements; ++i)
15799         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15800           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15801     }
15802     if (UseLoV2 && UseHiV2) {
15803       V2Blend =
15804         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15805     } else {
15806       // We only use half of V2 so map the usage down into the final blend mask.
15807       V2Blend = UseLoV2 ? LoV2 : HiV2;
15808       for (int i = 0; i < SplitNumElements; ++i)
15809         if (BlendMask[i] >= SplitNumElements)
15810           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15811     }
15812     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15813   };
15814   SDValue Lo = HalfBlend(LoMask);
15815   SDValue Hi = HalfBlend(HiMask);
15816   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15817 }
15818 
15819 /// Either split a vector in halves or decompose the shuffles and the
15820 /// blend/unpack.
15821 ///
15822 /// This is provided as a good fallback for many lowerings of non-single-input
15823 /// shuffles with more than one 128-bit lane. In those cases, we want to select
15824 /// between splitting the shuffle into 128-bit components and stitching those
15825 /// back together vs. extracting the single-input shuffles and blending those
15826 /// results.
15827 static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15828                                           SDValue V2, ArrayRef<int> Mask,
15829                                           const X86Subtarget &Subtarget,
15830                                           SelectionDAG &DAG) {
15831   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15832          "shuffles as it could then recurse on itself.");
15833   int Size = Mask.size();
15834 
15835   // If this can be modeled as a broadcast of two elements followed by a blend,
15836   // prefer that lowering. This is especially important because broadcasts can
15837   // often fold with memory operands.
15838   auto DoBothBroadcast = [&] {
15839     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15840     for (int M : Mask)
15841       if (M >= Size) {
15842         if (V2BroadcastIdx < 0)
15843           V2BroadcastIdx = M - Size;
15844         else if (M - Size != V2BroadcastIdx)
15845           return false;
15846       } else if (M >= 0) {
15847         if (V1BroadcastIdx < 0)
15848           V1BroadcastIdx = M;
15849         else if (M != V1BroadcastIdx)
15850           return false;
15851       }
15852     return true;
15853   };
15854   if (DoBothBroadcast())
15855     return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
15856                                                 DAG);
15857 
15858   // If the inputs all stem from a single 128-bit lane of each input, then we
15859   // split them rather than blending because the split will decompose to
15860   // unusually few instructions.
15861   int LaneCount = VT.getSizeInBits() / 128;
15862   int LaneSize = Size / LaneCount;
15863   SmallBitVector LaneInputs[2];
15864   LaneInputs[0].resize(LaneCount, false);
15865   LaneInputs[1].resize(LaneCount, false);
15866   for (int i = 0; i < Size; ++i)
15867     if (Mask[i] >= 0)
15868       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15869   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15870     return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
15871 
15872   // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15873   // requires that the decomposed single-input shuffles don't end up here.
15874   return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
15875                                               DAG);
15876 }
15877 
15878 // Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15879 // TODO: Extend to support v8f32 (+ 512-bit shuffles).
15880 static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15881                                                  SDValue V1, SDValue V2,
15882                                                  ArrayRef<int> Mask,
15883                                                  SelectionDAG &DAG) {
15884   assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15885 
15886   int LHSMask[4] = {-1, -1, -1, -1};
15887   int RHSMask[4] = {-1, -1, -1, -1};
15888   unsigned SHUFPMask = 0;
15889 
15890   // As SHUFPD uses a single LHS/RHS element per lane, we can always
15891   // perform the shuffle once the lanes have been shuffled in place.
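  // For example (illustrative): the v4f64 mask <2, 7, 0, 5> yields
  // LHSMask = <2, -1, 0, -1> and RHSMask = <-1, 7, -1, 5> (both simple lane
  // permutes), with SHUFPMask = 0b1010 then selecting the required element
  // from each lane of LHS/RHS.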
15892   for (int i = 0; i != 4; ++i) {
15893     int M = Mask[i];
15894     if (M < 0)
15895       continue;
15896     int LaneBase = i & ~1;
15897     auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15898     LaneMask[LaneBase + (M & 1)] = M;
15899     SHUFPMask |= (M & 1) << i;
15900   }
15901 
15902   SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15903   SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15904   return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15905                      DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
15906 }
15907 
15908 /// Lower a vector shuffle crossing multiple 128-bit lanes as
15909 /// a lane permutation followed by a per-lane permutation.
15910 ///
15911 /// This is mainly for cases where we can have non-repeating permutes
15912 /// in each lane.
15913 ///
15914 /// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15915 /// we should investigate merging them.
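///
/// For example (illustrative): a v8f32 reversal mask <7,6,5,4,3,2,1,0> can be
/// lowered as the cross-lane permute <4,5,6,7,0,1,2,3> (swapping the 128-bit
/// lanes) followed by the in-lane permute <3,2,1,0,7,6,5,4>.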
15916 static SDValue lowerShuffleAsLanePermuteAndPermute(
15917     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15918     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15919   int NumElts = VT.getVectorNumElements();
15920   int NumLanes = VT.getSizeInBits() / 128;
15921   int NumEltsPerLane = NumElts / NumLanes;
15922   bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15923 
15924   /// Attempts to find a sublane permute with the given size
15925   /// that gets all elements into their target lanes.
15926   ///
15927   /// If successful, returns the lowered shuffle (a cross-lane shuffle
15928   /// feeding a per-lane permute); otherwise returns an empty SDValue.
15929   auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15930     int NumSublanesPerLane = NumSublanes / NumLanes;
15931     int NumEltsPerSublane = NumElts / NumSublanes;
15932 
15933     SmallVector<int, 16> CrossLaneMask;
15934     SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15935     // CrossLaneMask but one entry == one sublane.
15936     SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15937 
15938     for (int i = 0; i != NumElts; ++i) {
15939       int M = Mask[i];
15940       if (M < 0)
15941         continue;
15942 
15943       int SrcSublane = M / NumEltsPerSublane;
15944       int DstLane = i / NumEltsPerLane;
15945 
15946       // We only need to get the elements into the right lane, not sublane.
15947       // So search all sublanes that make up the destination lane.
15948       bool Found = false;
15949       int DstSubStart = DstLane * NumSublanesPerLane;
15950       int DstSubEnd = DstSubStart + NumSublanesPerLane;
15951       for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15952         if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15953           continue;
15954 
15955         Found = true;
15956         CrossLaneMaskLarge[DstSublane] = SrcSublane;
15957         int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15958         InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15959         break;
15960       }
15961       if (!Found)
15962         return SDValue();
15963     }
15964 
15965     // Fill CrossLaneMask using CrossLaneMaskLarge.
15966     narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15967 
15968     if (!CanUseSublanes) {
15969       // If we're only shuffling a single lowest lane and the rest are identity
15970       // then don't bother.
15971       // TODO - isShuffleMaskInputInPlace could be extended to something like
15972       // this.
15973       int NumIdentityLanes = 0;
15974       bool OnlyShuffleLowestLane = true;
15975       for (int i = 0; i != NumLanes; ++i) {
15976         int LaneOffset = i * NumEltsPerLane;
15977         if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15978                                        i * NumEltsPerLane))
15979           NumIdentityLanes++;
15980         else if (CrossLaneMask[LaneOffset] != 0)
15981           OnlyShuffleLowestLane = false;
15982       }
15983       if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15984         return SDValue();
15985     }
15986 
15987     SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15988     return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15989                                 InLaneMask);
15990   };
15991 
15992   // First attempt a solution with full lanes.
15993   if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15994     return V;
15995 
15996   // The rest of the solutions use sublanes.
15997   if (!CanUseSublanes)
15998     return SDValue();
15999 
16000   // Then attempt a solution with 64-bit sublanes (vpermq).
16001   if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
16002     return V;
16003 
16004   // If that doesn't work and we have fast variable cross-lane shuffle,
16005   // attempt 32-bit sublanes (vpermd).
16006   if (!Subtarget.hasFastVariableCrossLaneShuffle())
16007     return SDValue();
16008 
16009   return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
16010 }
16011 
16012 /// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
16013 /// source with a lane permutation.
16014 ///
16015 /// This lowering strategy results in four instructions in the worst case for a
16016 /// single-input cross lane shuffle which is lower than any other fully general
16017 /// cross-lane shuffle strategy I'm aware of. Special cases for each particular
16018 /// shuffle pattern should be handled prior to trying this lowering.
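///
/// For example (illustrative): the single-input v4f64 mask <3, 1, 1, 3> is
/// lowered by flipping the 128-bit lanes of V1 and then applying the in-lane
/// mask <5, 1, 7, 3> to V1 and the flipped vector.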
16019 static SDValue lowerShuffleAsLanePermuteAndShuffle(
16020     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16021     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16022   // FIXME: This should probably be generalized for 512-bit vectors as well.
16023   assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
16024   int Size = Mask.size();
16025   int LaneSize = Size / 2;
16026 
16027   // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16028   // Only do this if the elements aren't all from the lower lane,
16029   // otherwise we're (probably) better off doing a split.
16030   if (VT == MVT::v4f64 &&
16031       !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
16032     if (SDValue V =
16033             lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
16034       return V;
16035 
16036   // If there are only inputs from one 128-bit lane, splitting will in fact be
16037   // less expensive. The flags track whether the given lane contains an element
16038   // that crosses to another lane.
16039   if (!Subtarget.hasAVX2()) {
16040     bool LaneCrossing[2] = {false, false};
16041     for (int i = 0; i < Size; ++i)
16042       if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
16043         LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
16044     if (!LaneCrossing[0] || !LaneCrossing[1])
16045       return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16046   } else {
16047     bool LaneUsed[2] = {false, false};
16048     for (int i = 0; i < Size; ++i)
16049       if (Mask[i] >= 0)
16050         LaneUsed[(Mask[i] % Size) / LaneSize] = true;
16051     if (!LaneUsed[0] || !LaneUsed[1])
16052       return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16053   }
16054 
16055   // TODO - we could support shuffling V2 in the Flipped input.
16056   assert(V2.isUndef() &&
16057          "This last part of this routine only works on single input shuffles");
16058 
16059   SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
16060   for (int i = 0; i < Size; ++i) {
16061     int &M = InLaneMask[i];
16062     if (M < 0)
16063       continue;
16064     if (((M % Size) / LaneSize) != (i / LaneSize))
16065       M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
16066   }
16067   assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
16068          "In-lane shuffle mask expected");
16069 
16070   // Flip the lanes, and shuffle the results which should now be in-lane.
16071   MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
16072   SDValue Flipped = DAG.getBitcast(PVT, V1);
16073   Flipped =
16074       DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
16075   Flipped = DAG.getBitcast(VT, Flipped);
16076   return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
16077 }
16078 
16079 /// Handle lowering 2-lane 128-bit shuffles.
16080 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
16081                                   SDValue V2, ArrayRef<int> Mask,
16082                                   const APInt &Zeroable,
16083                                   const X86Subtarget &Subtarget,
16084                                   SelectionDAG &DAG) {
16085   if (V2.isUndef()) {
16086     // Attempt to match VBROADCAST*128 subvector broadcast load.
16087     bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
16088     bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
16089     if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
16090         MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
16091       auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
16092       if (!Ld->isNonTemporal()) {
16093         MVT MemVT = VT.getHalfNumVectorElementsVT();
16094         unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
16095         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
16096         SDValue Ptr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
16097                                                TypeSize::Fixed(Ofs), DL);
16098         SDValue Ops[] = {Ld->getChain(), Ptr};
16099         SDValue BcastLd = DAG.getMemIntrinsicNode(
16100             X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops, MemVT,
16101             DAG.getMachineFunction().getMachineMemOperand(
16102                 Ld->getMemOperand(), Ofs, MemVT.getStoreSize()));
16103         DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
16104         return BcastLd;
16105       }
16106     }
16107 
16108     // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
16109     if (Subtarget.hasAVX2())
16110       return SDValue();
16111   }
16112 
16113   bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
16114 
16115   SmallVector<int, 4> WidenedMask;
16116   if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
16117     return SDValue();
16118 
16119   bool IsLowZero = (Zeroable & 0x3) == 0x3;
16120   bool IsHighZero = (Zeroable & 0xc) == 0xc;
16121 
16122   // Try to use an insert into a zero vector.
16123   if (WidenedMask[0] == 0 && IsHighZero) {
16124     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16125     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16126                               DAG.getIntPtrConstant(0, DL));
16127     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16128                        getZeroVector(VT, Subtarget, DAG, DL), LoV,
16129                        DAG.getIntPtrConstant(0, DL));
16130   }
16131 
16132   // TODO: If minimizing size and one of the inputs is a zero vector and
16133   // the zero vector has only one use, we could use a VPERM2X128 to save the
16134   // instruction bytes needed to explicitly generate the zero vector.
16135 
16136   // Blends are faster and handle all the non-lane-crossing cases.
16137   if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
16138                                           Subtarget, DAG))
16139     return Blend;
16140 
16141   // If either input operand is a zero vector, skip ahead to VPERM2X128 (below),
16142   // whose mask allows us to replace the zero input with an implicit zero.
16143   if (!IsLowZero && !IsHighZero) {
16144     // Check for patterns which can be matched with a single insert of a 128-bit
16145     // subvector.
16146     bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
16147     if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
16148 
16149       // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
16150       // this will likely become vinsertf128 which can't fold a 256-bit memop.
16151       if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
16152         MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16153         SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
16154                                      OnlyUsesV1 ? V1 : V2,
16155                                      DAG.getIntPtrConstant(0, DL));
16156         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16157                            DAG.getIntPtrConstant(2, DL));
16158       }
16159     }
16160 
16161     // Try to use SHUF128 if possible.
16162     if (Subtarget.hasVLX()) {
16163       if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
16164         unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
16165                             ((WidenedMask[1] % 2) << 1);
16166         return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
16167                            DAG.getTargetConstant(PermMask, DL, MVT::i8));
16168       }
16169     }
16170   }
16171 
16172   // Otherwise form a 128-bit permutation. After accounting for undefs,
16173   // convert the 64-bit shuffle mask selection values into 128-bit
16174   // selection bits by dividing the indexes by 2 and shifting into positions
16175   // defined by a vperm2*128 instruction's immediate control byte.
16176 
16177   // The immediate permute control byte looks like this:
16178   //    [1:0] - select 128 bits from sources for low half of destination
16179   //    [2]   - ignore
16180   //    [3]   - zero low half of destination
16181   //    [5:4] - select 128 bits from sources for high half of destination
16182   //    [6]   - ignore
16183   //    [7]   - zero high half of destination
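  // For example (illustrative): the v4f64 mask <2, 3, 6, 7> widens to
  // WidenedMask = <1, 3>, giving PermMask = 0x31 (upper half of V1 into the
  // low half of the result, upper half of V2 into the high half).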
16184 
16185   assert((WidenedMask[0] >= 0 || IsLowZero) &&
16186          (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
16187 
16188   unsigned PermMask = 0;
16189   PermMask |= IsLowZero  ? 0x08 : (WidenedMask[0] << 0);
16190   PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
16191 
16192   // Check the immediate mask and replace unused sources with undef.
16193   if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
16194     V1 = DAG.getUNDEF(VT);
16195   if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
16196     V2 = DAG.getUNDEF(VT);
16197 
16198   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
16199                      DAG.getTargetConstant(PermMask, DL, MVT::i8));
16200 }
16201 
16202 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
16203 /// shuffling each lane.
16204 ///
16205 /// This attempts to create a repeated lane shuffle where each lane uses one
16206 /// or two of the lanes of the inputs. The lanes of the input vectors are
16207 /// shuffled in one or two independent shuffles to get the lanes into the
16208 /// position needed by the final shuffle.
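///
/// For example (illustrative): the v8f32 mask <0, 8, 1, 9, 12, 4, 13, 5> can
/// be lowered by permuting the lanes into NewV1 = <V1 lo, V2 hi> and
/// NewV2 = <V2 lo, V1 hi>, and then applying the repeated unpcklps-style mask
/// <0, 8, 1, 9> within each lane of the new operands.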
16209 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
16210     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16211     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16212   assert(!V2.isUndef() && "This is only useful with multiple inputs.");
16213 
16214   if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16215     return SDValue();
16216 
16217   int NumElts = Mask.size();
16218   int NumLanes = VT.getSizeInBits() / 128;
16219   int NumLaneElts = 128 / VT.getScalarSizeInBits();
16220   SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
16221   SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
16222 
16223   // First pass will try to fill in the RepeatMask from lanes that need two
16224   // sources.
16225   for (int Lane = 0; Lane != NumLanes; ++Lane) {
16226     int Srcs[2] = {-1, -1};
16227     SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
16228     for (int i = 0; i != NumLaneElts; ++i) {
16229       int M = Mask[(Lane * NumLaneElts) + i];
16230       if (M < 0)
16231         continue;
16232       // Determine which of the possible input lanes (NumLanes from each source)
16233       // this element comes from. Assign that as one of the sources for this
16234       // lane. We can assign up to 2 sources for this lane. If we run out of
16235       // sources we can't do anything.
16236       int LaneSrc = M / NumLaneElts;
16237       int Src;
16238       if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
16239         Src = 0;
16240       else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
16241         Src = 1;
16242       else
16243         return SDValue();
16244 
16245       Srcs[Src] = LaneSrc;
16246       InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
16247     }
16248 
16249     // If this lane has two sources, see if it fits with the repeat mask so far.
16250     if (Srcs[1] < 0)
16251       continue;
16252 
16253     LaneSrcs[Lane][0] = Srcs[0];
16254     LaneSrcs[Lane][1] = Srcs[1];
16255 
16256     auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
16257       assert(M1.size() == M2.size() && "Unexpected mask size");
16258       for (int i = 0, e = M1.size(); i != e; ++i)
16259         if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
16260           return false;
16261       return true;
16262     };
16263 
16264     auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
16265       assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
16266       for (int i = 0, e = MergedMask.size(); i != e; ++i) {
16267         int M = Mask[i];
16268         if (M < 0)
16269           continue;
16270         assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
16271                "Unexpected mask element");
16272         MergedMask[i] = M;
16273       }
16274     };
16275 
16276     if (MatchMasks(InLaneMask, RepeatMask)) {
16277       // Merge this lane mask into the final repeat mask.
16278       MergeMasks(InLaneMask, RepeatMask);
16279       continue;
16280     }
16281 
16282     // Didn't find a match. Swap the operands and try again.
16283     std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
16284     ShuffleVectorSDNode::commuteMask(InLaneMask);
16285 
16286     if (MatchMasks(InLaneMask, RepeatMask)) {
16287       // Merge this lane mask into the final repeat mask.
16288       MergeMasks(InLaneMask, RepeatMask);
16289       continue;
16290     }
16291 
16292     // Couldn't find a match with the operands in either order.
16293     return SDValue();
16294   }
16295 
16296   // Now handle any lanes with only one source.
16297   for (int Lane = 0; Lane != NumLanes; ++Lane) {
16298     // If this lane has already been processed, skip it.
16299     if (LaneSrcs[Lane][0] >= 0)
16300       continue;
16301 
16302     for (int i = 0; i != NumLaneElts; ++i) {
16303       int M = Mask[(Lane * NumLaneElts) + i];
16304       if (M < 0)
16305         continue;
16306 
16307       // If RepeatMask isn't defined yet we can define it ourselves.
16308       if (RepeatMask[i] < 0)
16309         RepeatMask[i] = M % NumLaneElts;
16310 
16311       if (RepeatMask[i] < NumElts) {
16312         if (RepeatMask[i] != M % NumLaneElts)
16313           return SDValue();
16314         LaneSrcs[Lane][0] = M / NumLaneElts;
16315       } else {
16316         if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16317           return SDValue();
16318         LaneSrcs[Lane][1] = M / NumLaneElts;
16319       }
16320     }
16321 
16322     if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16323       return SDValue();
16324   }
16325 
16326   SmallVector<int, 16> NewMask(NumElts, -1);
16327   for (int Lane = 0; Lane != NumLanes; ++Lane) {
16328     int Src = LaneSrcs[Lane][0];
16329     for (int i = 0; i != NumLaneElts; ++i) {
16330       int M = -1;
16331       if (Src >= 0)
16332         M = Src * NumLaneElts + i;
16333       NewMask[Lane * NumLaneElts + i] = M;
16334     }
16335   }
16336   SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16337   // Ensure we didn't get back the shuffle we started with.
16338   // FIXME: This is a hack to make up for some splat handling code in
16339   // getVectorShuffle.
16340   if (isa<ShuffleVectorSDNode>(NewV1) &&
16341       cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
16342     return SDValue();
16343 
16344   for (int Lane = 0; Lane != NumLanes; ++Lane) {
16345     int Src = LaneSrcs[Lane][1];
16346     for (int i = 0; i != NumLaneElts; ++i) {
16347       int M = -1;
16348       if (Src >= 0)
16349         M = Src * NumLaneElts + i;
16350       NewMask[Lane * NumLaneElts + i] = M;
16351     }
16352   }
16353   SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16354   // Ensure we didn't get back the shuffle we started with.
16355   // FIXME: This is a hack to make up for some splat handling code in
16356   // getVectorShuffle.
16357   if (isa<ShuffleVectorSDNode>(NewV2) &&
16358       cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
16359     return SDValue();
16360 
16361   for (int i = 0; i != NumElts; ++i) {
16362     NewMask[i] = RepeatMask[i % NumLaneElts];
16363     if (NewMask[i] < 0)
16364       continue;
16365 
16366     NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16367   }
16368   return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
16369 }
16370 
16371 /// If the input shuffle mask results in a vector that is undefined in all upper
16372 /// or lower half elements and that mask accesses only 2 halves of the
16373 /// shuffle's operands, return true. A mask of half the width with mask indexes
16374 /// adjusted to access the extracted halves of the original shuffle operands is
16375 /// returned in HalfMask. HalfIdx1 and HalfIdx2 identify which half of which
16376 /// input operand is accessed (0/1 = lower/upper V1, 2/3 = lower/upper V2).
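///
/// For example (illustrative): for a v8i32 mask <u, u, u, u, 12, 13, 8, 9> the
/// lower half of the result is undef, HalfIdx1 = 3 (upper V2), HalfIdx2 = 2
/// (lower V2) and HalfMask = <0, 1, 4, 5>.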
16377 static bool
16378 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
16379                    int &HalfIdx1, int &HalfIdx2) {
16380   assert((Mask.size() == HalfMask.size() * 2) &&
16381          "Expected input mask to be twice as long as output");
16382 
16383   // Exactly one half of the result must be undef to allow narrowing.
16384   bool UndefLower = isUndefLowerHalf(Mask);
16385   bool UndefUpper = isUndefUpperHalf(Mask);
16386   if (UndefLower == UndefUpper)
16387     return false;
16388 
16389   unsigned HalfNumElts = HalfMask.size();
16390   unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
16391   HalfIdx1 = -1;
16392   HalfIdx2 = -1;
16393   for (unsigned i = 0; i != HalfNumElts; ++i) {
16394     int M = Mask[i + MaskIndexOffset];
16395     if (M < 0) {
16396       HalfMask[i] = M;
16397       continue;
16398     }
16399 
16400     // Determine which of the 4 half vectors this element is from.
16401     // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
16402     int HalfIdx = M / HalfNumElts;
16403 
16404     // Determine the element index into its half vector source.
16405     int HalfElt = M % HalfNumElts;
16406 
16407     // We can shuffle with up to 2 half vectors; set the new 'half'
16408     // shuffle mask accordingly.
16409     if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
16410       HalfMask[i] = HalfElt;
16411       HalfIdx1 = HalfIdx;
16412       continue;
16413     }
16414     if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
16415       HalfMask[i] = HalfElt + HalfNumElts;
16416       HalfIdx2 = HalfIdx;
16417       continue;
16418     }
16419 
16420     // Too many half vectors referenced.
16421     return false;
16422   }
16423 
16424   return true;
16425 }
16426 
16427 /// Given the output values from getHalfShuffleMask(), create a half width
16428 /// shuffle of extracted vectors followed by an insert back to full width.
16429 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
16430                                      ArrayRef<int> HalfMask, int HalfIdx1,
16431                                      int HalfIdx2, bool UndefLower,
16432                                      SelectionDAG &DAG, bool UseConcat = false) {
16433   assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
16434   assert(V1.getValueType().isSimple() && "Expecting only simple types");
16435 
16436   MVT VT = V1.getSimpleValueType();
16437   MVT HalfVT = VT.getHalfNumVectorElementsVT();
16438   unsigned HalfNumElts = HalfVT.getVectorNumElements();
16439 
16440   auto getHalfVector = [&](int HalfIdx) {
16441     if (HalfIdx < 0)
16442       return DAG.getUNDEF(HalfVT);
16443     SDValue V = (HalfIdx < 2 ? V1 : V2);
16444     HalfIdx = (HalfIdx % 2) * HalfNumElts;
16445     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
16446                        DAG.getIntPtrConstant(HalfIdx, DL));
16447   };
16448 
16449   // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
16450   SDValue Half1 = getHalfVector(HalfIdx1);
16451   SDValue Half2 = getHalfVector(HalfIdx2);
16452   SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
16453   if (UseConcat) {
16454     SDValue Op0 = V;
16455     SDValue Op1 = DAG.getUNDEF(HalfVT);
16456     if (UndefLower)
16457       std::swap(Op0, Op1);
16458     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
16459   }
16460 
16461   unsigned Offset = UndefLower ? HalfNumElts : 0;
16462   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
16463                      DAG.getIntPtrConstant(Offset, DL));
16464 }
16465 
16466 /// Lower shuffles where an entire half of a 256- or 512-bit vector is UNDEF.
16467 /// This allows for fast cases such as subvector extraction/insertion
16468 /// or shuffling smaller vector types which can lower more efficiently.
16469 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
16470                                          SDValue V2, ArrayRef<int> Mask,
16471                                          const X86Subtarget &Subtarget,
16472                                          SelectionDAG &DAG) {
16473   assert((VT.is256BitVector() || VT.is512BitVector()) &&
16474          "Expected 256-bit or 512-bit vector");
16475 
16476   bool UndefLower = isUndefLowerHalf(Mask);
16477   if (!UndefLower && !isUndefUpperHalf(Mask))
16478     return SDValue();
16479 
16480   assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
16481          "Completely undef shuffle mask should have been simplified already");
16482 
16483   // Upper half is undef and lower half is whole upper subvector.
16484   // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16485   MVT HalfVT = VT.getHalfNumVectorElementsVT();
16486   unsigned HalfNumElts = HalfVT.getVectorNumElements();
16487   if (!UndefLower &&
16488       isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
16489     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16490                              DAG.getIntPtrConstant(HalfNumElts, DL));
16491     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16492                        DAG.getIntPtrConstant(0, DL));
16493   }
16494 
16495   // Lower half is undef and upper half is whole lower subvector.
16496   // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16497   if (UndefLower &&
16498       isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
16499     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16500                              DAG.getIntPtrConstant(0, DL));
16501     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16502                        DAG.getIntPtrConstant(HalfNumElts, DL));
16503   }
16504 
16505   int HalfIdx1, HalfIdx2;
16506   SmallVector<int, 8> HalfMask(HalfNumElts);
16507   if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
16508     return SDValue();
16509 
16510   assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
16511 
16512   // Only shuffle the halves of the inputs when useful.
16513   unsigned NumLowerHalves =
16514       (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
16515   unsigned NumUpperHalves =
16516       (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
16517   assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
16518 
16519   // Determine the larger pattern of undef/halves, then decide if it's worth
16520   // splitting the shuffle based on subtarget capabilities and types.
16521   unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16522   if (!UndefLower) {
16523     // XXXXuuuu: no insert is needed.
16524     // Always extract lowers when setting lower - these are all free subreg ops.
16525     if (NumUpperHalves == 0)
16526       return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16527                                    UndefLower, DAG);
16528 
16529     if (NumUpperHalves == 1) {
16530       // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16531       if (Subtarget.hasAVX2()) {
16532         // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
16533         if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16534             !is128BitUnpackShuffleMask(HalfMask) &&
16535             (!isSingleSHUFPSMask(HalfMask) ||
16536              Subtarget.hasFastVariableCrossLaneShuffle()))
16537           return SDValue();
16538         // If this is a unary shuffle (assume that the 2nd operand is
16539         // canonicalized to undef), then we can use vpermpd. Otherwise, we
16540         // are better off extracting the upper half of 1 operand and using a
16541         // narrow shuffle.
16542         if (EltWidth == 64 && V2.isUndef())
16543           return SDValue();
16544       }
16545       // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16546       if (Subtarget.hasAVX512() && VT.is512BitVector())
16547         return SDValue();
16548       // Extract + narrow shuffle is better than the wide alternative.
16549       return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16550                                    UndefLower, DAG);
16551     }
16552 
16553     // Don't extract both uppers; instead shuffle and then extract.
16554     assert(NumUpperHalves == 2 && "Half vector count went wrong");
16555     return SDValue();
16556   }
16557 
16558   // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16559   if (NumUpperHalves == 0) {
16560     // AVX2 has efficient 64-bit element cross-lane shuffles.
16561     // TODO: Refine to account for unary shuffle, splat, and other masks?
16562     if (Subtarget.hasAVX2() && EltWidth == 64)
16563       return SDValue();
16564     // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16565     if (Subtarget.hasAVX512() && VT.is512BitVector())
16566       return SDValue();
16567     // Narrow shuffle + insert is better than the wide alternative.
16568     return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16569                                  UndefLower, DAG);
16570   }
16571 
16572   // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16573   return SDValue();
16574 }
16575 
16576 /// Test whether the specified input (0 or 1) is in-place blended by the
16577 /// given mask.
16578 ///
16579 /// This returns true if the elements from a particular input are already in the
16580 /// slot required by the given mask and require no permutation.
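///
/// For example (illustrative): for the v4 mask <0, 5, 2, 7> both inputs are in
/// place, since elements 0/2 of V1 and elements 1/3 of V2 (indices 5 and 7)
/// already sit in their destination slots.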
16581 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
16582   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
16583   int Size = Mask.size();
16584   for (int i = 0; i < Size; ++i)
16585     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
16586       return false;
16587 
16588   return true;
16589 }
16590 
16591 /// Handle the case where shuffle sources are coming from the same 128-bit
16592 /// lane and every lane can be represented as the same repeating mask -
16593 /// allowing us to shuffle the sources with the repeating shuffle and then
16594 /// permute the result to the destination lanes.
16595 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16596     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16597     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16598   int NumElts = VT.getVectorNumElements();
16599   int NumLanes = VT.getSizeInBits() / 128;
16600   int NumLaneElts = NumElts / NumLanes;
16601 
16602   // On AVX2 we may be able to just shuffle the lowest elements and then
16603   // broadcast the result.
16604   if (Subtarget.hasAVX2()) {
16605     for (unsigned BroadcastSize : {16, 32, 64}) {
16606       if (BroadcastSize <= VT.getScalarSizeInBits())
16607         continue;
16608       int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16609 
16610       // Attempt to match a repeating pattern every NumBroadcastElts,
16611       // accounting for UNDEFs but only referencing the lowest 128-bit
16612       // lane of the inputs.
16613       auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16614         for (int i = 0; i != NumElts; i += NumBroadcastElts)
16615           for (int j = 0; j != NumBroadcastElts; ++j) {
16616             int M = Mask[i + j];
16617             if (M < 0)
16618               continue;
16619             int &R = RepeatMask[j];
16620             if (0 != ((M % NumElts) / NumLaneElts))
16621               return false;
16622             if (0 <= R && R != M)
16623               return false;
16624             R = M;
16625           }
16626         return true;
16627       };
16628 
16629       SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16630       if (!FindRepeatingBroadcastMask(RepeatMask))
16631         continue;
16632 
16633       // Shuffle the (lowest) repeated elements in place for broadcast.
16634       SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16635 
16636       // Shuffle the actual broadcast.
16637       SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16638       for (int i = 0; i != NumElts; i += NumBroadcastElts)
16639         for (int j = 0; j != NumBroadcastElts; ++j)
16640           BroadcastMask[i + j] = j;
16641       return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16642                                   BroadcastMask);
16643     }
16644   }
16645 
16646   // Bail if the shuffle mask doesn't cross 128-bit lanes.
16647   if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16648     return SDValue();
16649 
16650   // Bail if we already have a repeated lane shuffle mask.
16651   SmallVector<int, 8> RepeatedShuffleMask;
16652   if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
16653     return SDValue();
16654 
16655   // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16656   // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
16657   int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
16658   int NumSubLanes = NumLanes * SubLaneScale;
16659   int NumSubLaneElts = NumLaneElts / SubLaneScale;
16660 
16661   // Check that all the sources are coming from the same lane and see if we can
16662   // form a repeating shuffle mask (local to each sub-lane). At the same time,
16663   // determine the source sub-lane for each destination sub-lane.
16664   int TopSrcSubLane = -1;
16665   SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16666   SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
16667       SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
16668       SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
16669 
16670   for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16671     // Extract the sub-lane mask, check that it all comes from the same lane
16672     // and normalize the mask entries to come from the first lane.
16673     int SrcLane = -1;
16674     SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16675     for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16676       int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16677       if (M < 0)
16678         continue;
16679       int Lane = (M % NumElts) / NumLaneElts;
16680       if ((0 <= SrcLane) && (SrcLane != Lane))
16681         return SDValue();
16682       SrcLane = Lane;
16683       int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16684       SubLaneMask[Elt] = LocalM;
16685     }
16686 
16687     // Whole sub-lane is UNDEF.
16688     if (SrcLane < 0)
16689       continue;
16690 
16691     // Attempt to match against the candidate repeated sub-lane masks.
16692     for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16693       auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16694         for (int i = 0; i != NumSubLaneElts; ++i) {
16695           if (M1[i] < 0 || M2[i] < 0)
16696             continue;
16697           if (M1[i] != M2[i])
16698             return false;
16699         }
16700         return true;
16701       };
16702 
16703       auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16704       if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16705         continue;
16706 
16707       // Merge the sub-lane mask into the matching repeated sub-lane mask.
16708       for (int i = 0; i != NumSubLaneElts; ++i) {
16709         int M = SubLaneMask[i];
16710         if (M < 0)
16711           continue;
16712         assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16713                "Unexpected mask element");
16714         RepeatedSubLaneMask[i] = M;
16715       }
16716 
16717       // Track the topmost source sub-lane - by setting the remaining to UNDEF
16718       // we can greatly simplify shuffle matching.
16719       int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16720       TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16721       Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16722       break;
16723     }
16724 
16725     // Bail if we failed to find a matching repeated sub-lane mask.
16726     if (Dst2SrcSubLanes[DstSubLane] < 0)
16727       return SDValue();
16728   }
16729   assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16730          "Unexpected source lane");
16731 
16732   // Create a repeating shuffle mask for the entire vector.
16733   SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16734   for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16735     int Lane = SubLane / SubLaneScale;
16736     auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16737     for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16738       int M = RepeatedSubLaneMask[Elt];
16739       if (M < 0)
16740         continue;
16741       int Idx = (SubLane * NumSubLaneElts) + Elt;
16742       RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16743     }
16744   }
16745   SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16746 
16747   // Shuffle each source sub-lane to its destination.
16748   SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16749   for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16750     int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16751     if (SrcSubLane < 0)
16752       continue;
16753     for (int j = 0; j != NumSubLaneElts; ++j)
16754       SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16755   }
16756 
16757   return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16758                               SubLaneMask);
16759 }
16760 
16761 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16762                                    bool &ForceV1Zero, bool &ForceV2Zero,
16763                                    unsigned &ShuffleImm, ArrayRef<int> Mask,
16764                                    const APInt &Zeroable) {
16765   int NumElts = VT.getVectorNumElements();
16766   assert(VT.getScalarSizeInBits() == 64 &&
16767          (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16768          "Unexpected data type for VSHUFPD");
16769   assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16770          "Illegal shuffle mask");
16771 
16772   bool ZeroLane[2] = { true, true };
16773   for (int i = 0; i < NumElts; ++i)
16774     ZeroLane[i & 1] &= Zeroable[i];
16775 
16776   // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
16777   // Mask for V4F64: 0/1,  4/5,  2/3,  6/7..
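  // For example (illustrative): the v4f64 mask <1, 5, 2, 7> matches the
  // 0/1, 4/5, 2/3, 6/7 pattern and encodes as ShuffleImm = 0b1011.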
16778   ShuffleImm = 0;
16779   bool ShufpdMask = true;
16780   bool CommutableMask = true;
16781   for (int i = 0; i < NumElts; ++i) {
16782     if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16783       continue;
16784     if (Mask[i] < 0)
16785       return false;
16786     int Val = (i & 6) + NumElts * (i & 1);
16787     int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16788     if (Mask[i] < Val || Mask[i] > Val + 1)
16789       ShufpdMask = false;
16790     if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16791       CommutableMask = false;
16792     ShuffleImm |= (Mask[i] % 2) << i;
16793   }
16794 
16795   if (!ShufpdMask && !CommutableMask)
16796     return false;
16797 
16798   if (!ShufpdMask && CommutableMask)
16799     std::swap(V1, V2);
16800 
16801   ForceV1Zero = ZeroLane[0];
16802   ForceV2Zero = ZeroLane[1];
16803   return true;
16804 }
16805 
16806 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16807                                       SDValue V2, ArrayRef<int> Mask,
16808                                       const APInt &Zeroable,
16809                                       const X86Subtarget &Subtarget,
16810                                       SelectionDAG &DAG) {
16811   assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16812          "Unexpected data type for VSHUFPD");
16813 
16814   unsigned Immediate = 0;
16815   bool ForceV1Zero = false, ForceV2Zero = false;
16816   if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16817                               Mask, Zeroable))
16818     return SDValue();
16819 
16820   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16821   if (ForceV1Zero)
16822     V1 = getZeroVector(VT, Subtarget, DAG, DL);
16823   if (ForceV2Zero)
16824     V2 = getZeroVector(VT, Subtarget, DAG, DL);
16825 
16826   return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16827                      DAG.getTargetConstant(Immediate, DL, MVT::i8));
16828 }
16829 
16830 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16831 // by zeroable elements in the remaining 24 elements. Turn this into two
16832 // vmovqb instructions shuffled together.
16833 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16834                                              SDValue V1, SDValue V2,
16835                                              ArrayRef<int> Mask,
16836                                              const APInt &Zeroable,
16837                                              SelectionDAG &DAG) {
16838   assert(VT == MVT::v32i8 && "Unexpected type!");
16839 
16840   // The first 8 indices should be every 8th element.
16841   if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16842     return SDValue();
16843 
16844   // Remaining elements need to be zeroable.
16845   if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
16846     return SDValue();
16847 
16848   V1 = DAG.getBitcast(MVT::v4i64, V1);
16849   V2 = DAG.getBitcast(MVT::v4i64, V2);
16850 
16851   V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16852   V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16853 
16854   // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16855   // the upper bits of the result using an unpckldq.
16856   SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16857                                         { 0, 1, 2, 3, 16, 17, 18, 19,
16858                                           4, 5, 6, 7, 20, 21, 22, 23 });
16859   // Insert the unpckldq into a zero vector to widen to v32i8.
16860   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16861                      DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16862                      DAG.getIntPtrConstant(0, DL));
16863 }
16864 
16865 
16866 /// Handle lowering of 4-lane 64-bit floating point shuffles.
16867 ///
16868 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16869 /// isn't available.
16870 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16871                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16872                                  const X86Subtarget &Subtarget,
16873                                  SelectionDAG &DAG) {
16874   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16875   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16876   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16877 
16878   if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16879                                      Subtarget, DAG))
16880     return V;
16881 
16882   if (V2.isUndef()) {
16883     // Check for being able to broadcast a single element.
16884     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16885                                                     Mask, Subtarget, DAG))
16886       return Broadcast;
16887 
16888     // Use low duplicate instructions for masks that match their pattern.
16889     if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16890       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16891 
16892     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16893       // Non-half-crossing single input shuffles can be lowered with an
16894       // interleaved permutation.
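      // For example (illustrative): the mask <1, 0, 3, 2> yields the
      // VPERMILPD immediate 0b0101 (swap the elements within each lane).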
16895       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16896                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16897       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16898                          DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16899     }
16900 
16901     // With AVX2 we have direct support for this permutation.
16902     if (Subtarget.hasAVX2())
16903       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16904                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16905 
16906     // Try to create an in-lane repeating shuffle mask and then shuffle the
16907     // results into the target lanes.
16908     if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16909             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16910       return V;
16911 
16912     // Try to permute the lanes and then use a per-lane permute.
16913     if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16914                                                         Mask, DAG, Subtarget))
16915       return V;
16916 
16917     // Otherwise, fall back.
16918     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16919                                                DAG, Subtarget);
16920   }
16921 
16922   // Use dedicated unpack instructions for masks that match their pattern.
16923   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
16924     return V;
16925 
16926   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16927                                           Zeroable, Subtarget, DAG))
16928     return Blend;
16929 
16930   // Check if the blend happens to exactly fit that of SHUFPD.
16931   if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16932                                           Zeroable, Subtarget, DAG))
16933     return Op;
16934 
16935   // If we have lane crossing shuffles AND they don't all come from the lower
16936   // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16937   // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16938   // canonicalizes to a blend of splats, which isn't necessary for this combine.
16939   if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16940       !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16941       (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16942       (V2.getOpcode() != ISD::BUILD_VECTOR))
16943     if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
16944                                                        Mask, DAG))
16945       return Op;
16946 
16947   // If we have one input in place, then we can permute the other input and
16948   // blend the result.
16949   if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
16950     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16951                                                 Subtarget, DAG);
16952 
16953   // Try to create an in-lane repeating shuffle mask and then shuffle the
16954   // results into the target lanes.
16955   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16956           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16957     return V;
16958 
16959   // Try to simplify this by merging 128-bit lanes to enable a lane-based
16960   // shuffle. However, if we have AVX2 and either input is already in place,
16961   // we will be able to shuffle the other input even across lanes in a single
16962   // instruction, so skip this pattern.
16963   if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
16964                                 isShuffleMaskInputInPlace(1, Mask))))
16965     if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16966             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16967       return V;
16968 
16969   // If we have VLX support, we can use VEXPAND.
16970   if (Subtarget.hasVLX())
16971     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
16972                                          DAG, Subtarget))
16973       return V;
16974 
16975   // If we have AVX2 then we always want to lower with a blend because for v4
16976   // we can fully permute the elements.
16977   if (Subtarget.hasAVX2())
16978     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16979                                                 Subtarget, DAG);
16980 
16981   // Otherwise fall back on generic lowering.
16982   return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
16983                                     Subtarget, DAG);
16984 }
16985 
16986 /// Handle lowering of 4-lane 64-bit integer shuffles.
16987 ///
16988 /// This routine is only called when we have AVX2 and thus a reasonable
16989 /// instruction set for v4i64 shuffling.
16990 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16991                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16992                                  const X86Subtarget &Subtarget,
16993                                  SelectionDAG &DAG) {
16994   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16995   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16996   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16997   assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16998 
16999   if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
17000                                      Subtarget, DAG))
17001     return V;
17002 
17003   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
17004                                           Zeroable, Subtarget, DAG))
17005     return Blend;
17006 
17007   // Check for being able to broadcast a single element.
17008   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
17009                                                   Subtarget, DAG))
17010     return Broadcast;
17011 
17012   if (V2.isUndef()) {
17013     // When the shuffle is mirrored between the 128-bit lanes of the vector, we
17014     // can use lower-latency instructions that will operate on both lanes.
17015     SmallVector<int, 2> RepeatedMask;
17016     if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
17017       SmallVector<int, 4> PSHUFDMask;
17018       narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
17019       return DAG.getBitcast(
17020           MVT::v4i64,
17021           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
17022                       DAG.getBitcast(MVT::v8i32, V1),
17023                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17024     }
17025 
17026     // AVX2 provides a direct instruction for permuting a single input across
17027     // lanes.
17028     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
17029                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17030   }
17031 
17032   // Try to use shift instructions.
17033   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
17034                                           Zeroable, Subtarget, DAG))
17035     return Shift;
17036 
17037   // If we have VLX support, we can use VALIGN or VEXPAND.
17038   if (Subtarget.hasVLX()) {
17039     if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
17040                                               Subtarget, DAG))
17041       return Rotate;
17042 
17043     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
17044                                          DAG, Subtarget))
17045       return V;
17046   }
17047 
17048   // Try to use PALIGNR.
17049   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
17050                                                 Subtarget, DAG))
17051     return Rotate;
17052 
17053   // Use dedicated unpack instructions for masks that match their pattern.
17054   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
17055     return V;
17056 
17057   // If we have one input in place, then we can permute the other input and
17058   // blend the result.
17059   if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17060     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17061                                                 Subtarget, DAG);
17062 
17063   // Try to create an in-lane repeating shuffle mask and then shuffle the
17064   // results into the target lanes.
17065   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17066           DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17067     return V;
17068 
17069   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17070   // shuffle. However, if we have AVX2 and either input is already in place,
17071   // we will be able to shuffle the other input even across lanes in a single
17072   // instruction, so skip this pattern.
17073   if (!isShuffleMaskInputInPlace(0, Mask) &&
17074       !isShuffleMaskInputInPlace(1, Mask))
17075     if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17076             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17077       return Result;
17078 
17079   // Otherwise fall back on generic blend lowering.
17080   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17081                                               Subtarget, DAG);
17082 }
17083 
17084 /// Handle lowering of 8-lane 32-bit floating point shuffles.
17085 ///
17086 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
17087 /// isn't available.
17088 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17089                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17090                                  const X86Subtarget &Subtarget,
17091                                  SelectionDAG &DAG) {
17092   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17093   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17094   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17095 
17096   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
17097                                           Zeroable, Subtarget, DAG))
17098     return Blend;
17099 
17100   // Check for being able to broadcast a single element.
17101   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
17102                                                   Subtarget, DAG))
17103     return Broadcast;
17104 
17105   // If the shuffle mask is repeated in each 128-bit lane, we have many more
17106   // options to efficiently lower the shuffle.
17107   SmallVector<int, 4> RepeatedMask;
17108   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
17109     assert(RepeatedMask.size() == 4 &&
17110            "Repeated masks must be half the mask width!");
17111 
17112     // Use even/odd duplicate instructions for masks that match their pattern.
17113     if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17114       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
17115     if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17116       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
17117 
17118     if (V2.isUndef())
17119       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
17120                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17121 
17122     // Use dedicated unpack instructions for masks that match their pattern.
17123     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
17124       return V;
17125 
17126     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
17127     // have already handled any direct blends.
17128     return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
17129   }
17130 
17131   // Try to create an in-lane repeating shuffle mask and then shuffle the
17132   // results into the target lanes.
17133   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17134           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17135     return V;
17136 
17137   // If we have a single-input shuffle with different shuffle patterns in the
17138   // two 128-bit lanes, use a variable-mask VPERMILPS.
17139   if (V2.isUndef()) {
17140     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
17141       SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17142       return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
17143     }
17144     if (Subtarget.hasAVX2()) {
17145       SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17146       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
17147     }
17148     // Otherwise, fall back.
17149     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
17150                                                DAG, Subtarget);
17151   }
17152 
17153   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17154   // shuffle.
17155   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17156           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17157     return Result;
17158 
17159   // If we have VLX support, we can use VEXPAND.
17160   if (Subtarget.hasVLX())
17161     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
17162                                          DAG, Subtarget))
17163       return V;
17164 
17165   // For non-AVX512 targets, if the mask matches an in-lane 16-bit unpack
17166   // pattern, try to split: the split form uses vpunpcklwd and vpunpckhwd,
17167   // which is more efficient than vblend.
17168   if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
17169     return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
17170                                       DAG);
17171 
17172   // If we have AVX2 then we always want to lower with a blend because at v8 we
17173   // can fully permute the elements.
17174   if (Subtarget.hasAVX2())
17175     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
17176                                                 Subtarget, DAG);
17177 
17178   // Otherwise fall back on generic lowering.
17179   return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
17180                                     Subtarget, DAG);
17181 }
17182 
17183 /// Handle lowering of 8-lane 32-bit integer shuffles.
17184 ///
17185 /// This routine is only called when we have AVX2 and thus a reasonable
17186 /// instruction set for v8i32 shuffling.
17187 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17188                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17189                                  const X86Subtarget &Subtarget,
17190                                  SelectionDAG &DAG) {
17191   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17192   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17193   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17194   assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
17195 
17196   // Whenever we can lower this as a zext, that instruction is strictly faster
17197   // than any alternative. It also allows us to fold memory operands into the
17198   // shuffle in many cases.
17199   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17200                                                    Zeroable, Subtarget, DAG))
17201     return ZExt;
17202 
17203   // For non-AVX512 targets, if the mask matches an in-lane 16-bit unpack
17204   // pattern, try to split: the split form uses vpunpcklwd and vpunpckhwd,
17205   // which is more efficient than vblend.
17206   if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
17207       !Subtarget.hasAVX512())
17208     return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
17209                                       DAG);
17210 
17211   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
17212                                           Zeroable, Subtarget, DAG))
17213     return Blend;
17214 
17215   // Check for being able to broadcast a single element.
17216   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
17217                                                   Subtarget, DAG))
17218     return Broadcast;
17219 
17220   // If the shuffle mask is repeated in each 128-bit lane we can use more
17221   // efficient instructions that mirror the shuffles across the two 128-bit
17222   // lanes.
17223   SmallVector<int, 4> RepeatedMask;
17224   bool Is128BitLaneRepeatedShuffle =
17225       is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
17226   if (Is128BitLaneRepeatedShuffle) {
17227     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17228     if (V2.isUndef())
17229       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
17230                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17231 
17232     // Use dedicated unpack instructions for masks that match their pattern.
17233     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
17234       return V;
17235   }
17236 
17237   // Try to use shift instructions.
17238   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
17239                                           Zeroable, Subtarget, DAG))
17240     return Shift;
17241 
17242   // If we have VLX support, we can use VALIGN or EXPAND.
17243   if (Subtarget.hasVLX()) {
17244     if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
17245                                               Subtarget, DAG))
17246       return Rotate;
17247 
17248     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
17249                                          DAG, Subtarget))
17250       return V;
17251   }
17252 
17253   // Try to use byte rotation instructions.
17254   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17255                                                 Subtarget, DAG))
17256     return Rotate;
17257 
17258   // Try to create an in-lane repeating shuffle mask and then shuffle the
17259   // results into the target lanes.
17260   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17261           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17262     return V;
17263 
17264   if (V2.isUndef()) {
17265     // Try to produce a fixed cross-128-bit lane permute followed by unpack
17266     // because that should be faster than the variable permute alternatives.
17267     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
17268       return V;
17269 
17270     // If the shuffle patterns aren't repeated but it's a single input, directly
17271     // generate a cross-lane VPERMD instruction.
17272     SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17273     return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17274   }
17275 
17276   // Assume that a single SHUFPS is faster than an alternative sequence of
17277   // multiple instructions (even if the CPU has a domain penalty).
17278   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
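        // E.g. a per-lane repeated mask <1, 3, 5, 7> takes the odd elements of each
        // input and is a single SHUFPS.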
17279   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17280     SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17281     SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17282     SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17283                                             CastV1, CastV2, DAG);
17284     return DAG.getBitcast(MVT::v8i32, ShufPS);
17285   }
17286 
17287   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17288   // shuffle.
17289   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17290           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17291     return Result;
17292 
17293   // Otherwise fall back on generic blend lowering.
17294   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17295                                               Subtarget, DAG);
17296 }
17297 
17298 /// Handle lowering of 16-lane 16-bit integer shuffles.
17299 ///
17300 /// This routine is only called when we have AVX2 and thus a reasonable
17301 /// instruction set for v16i16 shuffling.
17302 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17303                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17304                                   const X86Subtarget &Subtarget,
17305                                   SelectionDAG &DAG) {
17306   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17307   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17308   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17309   assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
17310 
17311   // Whenever we can lower this as a zext, that instruction is strictly faster
17312   // than any alternative. It also allows us to fold memory operands into the
17313   // shuffle in many cases.
17314   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17315           DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17316     return ZExt;
17317 
17318   // Check for being able to broadcast a single element.
17319   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
17320                                                   Subtarget, DAG))
17321     return Broadcast;
17322 
17323   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
17324                                           Zeroable, Subtarget, DAG))
17325     return Blend;
17326 
17327   // Use dedicated unpack instructions for masks that match their pattern.
17328   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
17329     return V;
17330 
17331   // Use dedicated pack instructions for masks that match their pattern.
17332   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
17333                                        Subtarget))
17334     return V;
17335 
17336   // Try to lower using a truncation.
17337   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17338                                        Subtarget, DAG))
17339     return V;
17340 
17341   // Try to use shift instructions.
17342   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
17343                                           Zeroable, Subtarget, DAG))
17344     return Shift;
17345 
17346   // Try to use byte rotation instructions.
17347   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17348                                                 Subtarget, DAG))
17349     return Rotate;
17350 
17351   // Try to create an in-lane repeating shuffle mask and then shuffle the
17352   // results into the target lanes.
17353   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17354           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17355     return V;
17356 
17357   if (V2.isUndef()) {
17358     // Try to use bit rotation instructions.
17359     if (SDValue Rotate =
17360             lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17361       return Rotate;
17362 
17363     // Try to produce a fixed cross-128-bit lane permute followed by unpack
17364     // because that should be faster than the variable permute alternatives.
17365     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
17366       return V;
17367 
17368     // There are no generalized cross-lane shuffle operations available on i16
17369     // element types.
17370     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17371       if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17372               DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17373         return V;
17374 
17375       return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17376                                                  DAG, Subtarget);
17377     }
17378 
17379     SmallVector<int, 8> RepeatedMask;
17380     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17381       // As this is a single-input shuffle, the repeated mask should be
17382       // a strictly valid v8i16 mask that we can pass through to the v8i16
17383       // lowering to handle even the v16 case.
17384       return lowerV8I16GeneralSingleInputShuffle(
17385           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17386     }
17387   }
17388 
17389   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17390                                               Zeroable, Subtarget, DAG))
17391     return PSHUFB;
17392 
17393   // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17394   if (Subtarget.hasBWI())
17395     return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17396 
17397   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17398   // shuffle.
17399   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17400           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17401     return Result;
17402 
17403   // Try to permute the lanes and then use a per-lane permute.
17404   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17405           DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17406     return V;
17407 
17408   // Otherwise fall back on generic lowering.
17409   return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
17410                                     Subtarget, DAG);
17411 }
17412 
17413 /// Handle lowering of 32-lane 8-bit integer shuffles.
17414 ///
17415 /// This routine is only called when we have AVX2 and thus a reasonable
17416 /// instruction set for v32i8 shuffling.
17417 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17418                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17419                                  const X86Subtarget &Subtarget,
17420                                  SelectionDAG &DAG) {
17421   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17422   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17423   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17424   assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17425 
17426   // Whenever we can lower this as a zext, that instruction is strictly faster
17427   // than any alternative. It also allows us to fold memory operands into the
17428   // shuffle in many cases.
17429   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17430                                                    Zeroable, Subtarget, DAG))
17431     return ZExt;
17432 
17433   // Check for being able to broadcast a single element.
17434   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17435                                                   Subtarget, DAG))
17436     return Broadcast;
17437 
17438   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17439                                           Zeroable, Subtarget, DAG))
17440     return Blend;
17441 
17442   // Use dedicated unpack instructions for masks that match their pattern.
17443   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
17444     return V;
17445 
17446   // Use dedicated pack instructions for masks that match their pattern.
17447   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
17448                                        Subtarget))
17449     return V;
17450 
17451   // Try to lower using a truncation.
17452   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17453                                        Subtarget, DAG))
17454     return V;
17455 
17456   // Try to use shift instructions.
17457   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
17458                                           Zeroable, Subtarget, DAG))
17459     return Shift;
17460 
17461   // Try to use byte rotation instructions.
17462   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17463                                                 Subtarget, DAG))
17464     return Rotate;
17465 
17466   // Try to use bit rotation instructions.
17467   if (V2.isUndef())
17468     if (SDValue Rotate =
17469             lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17470       return Rotate;
17471 
17472   // Try to create an in-lane repeating shuffle mask and then shuffle the
17473   // results into the target lanes.
17474   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17475           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17476     return V;
17477 
17478   // There are no generalized cross-lane shuffle operations available on i8
17479   // element types.
17480   if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17481     // Try to produce a fixed cross-128-bit lane permute followed by unpack
17482     // because that should be faster than the variable permute alternatives.
17483     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
17484       return V;
17485 
17486     if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17487             DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17488       return V;
17489 
17490     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17491                                                DAG, Subtarget);
17492   }
17493 
17494   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17495                                               Zeroable, Subtarget, DAG))
17496     return PSHUFB;
17497 
17498   // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17499   if (Subtarget.hasVBMI())
17500     return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17501 
17502   // Try to simplify this by merging 128-bit lanes to enable a lane-based
17503   // shuffle.
17504   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17505           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17506     return Result;
17507 
17508   // Try to permute the lanes and then use a per-lane permute.
17509   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17510           DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17511     return V;
17512 
17513   // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17514   // by zeroable elements in the remaining 24 elements. Turn this into two
17515   // vmovqb instructions shuffled together.
17516   if (Subtarget.hasVLX())
17517     if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17518                                                   Mask, Zeroable, DAG))
17519       return V;
17520 
17521   // Otherwise fall back on generic lowering.
17522   return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
17523                                     Subtarget, DAG);
17524 }
17525 
17526 /// High-level routine to lower various 256-bit x86 vector shuffles.
17527 ///
17528 /// This routine either breaks down the specific type of a 256-bit x86 vector
17529 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
17530 /// together based on the available instructions.
17531 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17532                                   SDValue V1, SDValue V2, const APInt &Zeroable,
17533                                   const X86Subtarget &Subtarget,
17534                                   SelectionDAG &DAG) {
17535   // If we have a single input to the zero element, insert that into V1 if we
17536   // can do so cheaply.
17537   int NumElts = VT.getVectorNumElements();
17538   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17539 
17540   if (NumV2Elements == 1 && Mask[0] >= NumElts)
17541     if (SDValue Insertion = lowerShuffleAsElementInsertion(
17542             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17543       return Insertion;
17544 
17545   // Handle special cases where the lower or upper half is UNDEF.
17546   if (SDValue V =
17547           lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17548     return V;
17549 
17550   // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17551   // can check for those subtargets here and avoid much of the subtarget
17552   // querying in the per-vector-type lowering routines. With AVX1 we have
17553   // essentially *zero* ability to manipulate a 256-bit vector with integer
17554   // types. Since we'll use floating point types there eventually, just
17555   // immediately cast everything to a float and operate entirely in that domain.
17556   if (VT.isInteger() && !Subtarget.hasAVX2()) {
17557     int ElementBits = VT.getScalarSizeInBits();
17558     if (ElementBits < 32) {
17559       // No floating point type available, if we can't use the bit operations
17560       // for masking/blending then decompose into 128-bit vectors.
17561       if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17562                                             Subtarget, DAG))
17563         return V;
17564       if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17565         return V;
17566       return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
17567     }
17568 
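          // E.g. a v8i32 shuffle is performed as a v8f32 shuffle and the result is
          // bitcast back to v8i32.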
17569     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17570                                 VT.getVectorNumElements());
17571     V1 = DAG.getBitcast(FpVT, V1);
17572     V2 = DAG.getBitcast(FpVT, V2);
17573     return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17574   }
17575 
17576   switch (VT.SimpleTy) {
17577   case MVT::v4f64:
17578     return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17579   case MVT::v4i64:
17580     return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17581   case MVT::v8f32:
17582     return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17583   case MVT::v8i32:
17584     return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17585   case MVT::v16i16:
17586     return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17587   case MVT::v32i8:
17588     return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17589 
17590   default:
17591     llvm_unreachable("Not a valid 256-bit x86 vector type!");
17592   }
17593 }
17594 
17595 /// Try to lower a vector shuffle as a series of 128-bit shuffles.
17596 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17597                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17598                                   const X86Subtarget &Subtarget,
17599                                   SelectionDAG &DAG) {
17600   assert(VT.getScalarSizeInBits() == 64 &&
17601          "Unexpected element type size for 128bit shuffle.");
17602 
17603   // Handling a 256-bit vector would require VLX, and the function
17604   // lowerV2X128VectorShuffle() is most probably a better solution for it.
17605   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17606 
17607   // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17608   SmallVector<int, 4> Widened128Mask;
17609   if (!canWidenShuffleElements(Mask, Widened128Mask))
17610     return SDValue();
17611   assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17612 
17613   // Try to use an insert into a zero vector.
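        // Zeroable has one bit per original 64-bit element: 0xf0 means the upper
        // four elements (the high 256 bits) are zeroable and 0x0c means elements
        // 2 and 3 (the second 128-bit chunk) are zeroable.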
17614   if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17615       (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17616     unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17617     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17618     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17619                               DAG.getIntPtrConstant(0, DL));
17620     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17621                        getZeroVector(VT, Subtarget, DAG, DL), LoV,
17622                        DAG.getIntPtrConstant(0, DL));
17623   }
17624 
17625   // Check for patterns which can be matched with a single insert of a 256-bit
17626   // subvector.
17627   bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17628   if (OnlyUsesV1 ||
17629       isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17630     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17631     SDValue SubVec =
17632         DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17633                     DAG.getIntPtrConstant(0, DL));
17634     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17635                        DAG.getIntPtrConstant(4, DL));
17636   }
17637 
17638   // See if this is an insertion of the lower 128-bits of V2 into V1.
17639   bool IsInsert = true;
17640   int V2Index = -1;
17641   for (int i = 0; i < 4; ++i) {
17642     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17643     if (Widened128Mask[i] < 0)
17644       continue;
17645 
17646     // Make sure all V1 subvectors are in place.
17647     if (Widened128Mask[i] < 4) {
17648       if (Widened128Mask[i] != i) {
17649         IsInsert = false;
17650         break;
17651       }
17652     } else {
17653       // Make sure we only have a single V2 index and it's the lowest 128 bits.
17654       if (V2Index >= 0 || Widened128Mask[i] != 4) {
17655         IsInsert = false;
17656         break;
17657       }
17658       V2Index = i;
17659     }
17660   }
17661   if (IsInsert && V2Index >= 0) {
17662     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17663     SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17664                                  DAG.getIntPtrConstant(0, DL));
17665     return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17666   }
17667 
17668   // See if we can widen to a 256-bit lane shuffle. We're going to lose the
17669   // per-128-bit-lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
17670   // widening where possible we at least ensure the lanes stay sequential to
17671   // help later combines.
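        // E.g. a widened 128-bit mask <0, U, 2, 3> widens to <0, 1> and narrows back
        // to <0, 1, 2, 3>, replacing the undef lane with a sequential index.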
17672   SmallVector<int, 2> Widened256Mask;
17673   if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17674     Widened128Mask.clear();
17675     narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17676   }
17677 
17678   // Try to lower to vshuf64x2/vshuf32x4.
17679   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17680   unsigned PermMask = 0;
17681   // Ensure all elements come from the same Op.
17682   for (int i = 0; i < 4; ++i) {
17683     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17684     if (Widened128Mask[i] < 0)
17685       continue;
17686 
17687     SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17688     unsigned OpIndex = i / 2;
17689     if (Ops[OpIndex].isUndef())
17690       Ops[OpIndex] = Op;
17691     else if (Ops[OpIndex] != Op)
17692       return SDValue();
17693 
17694     // Convert the 128-bit shuffle mask selection values into 128-bit selection
17695     // bits defined by a vshuf64x2 instruction's immediate control byte.
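          // E.g. a widened mask <0, 1, 6, 7> selects the low 256 bits of V1 and the
          // high 256 bits of V2 and encodes as the immediate 0xE4.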
17696     PermMask |= (Widened128Mask[i] % 4) << (i * 2);
17697   }
17698 
17699   return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17700                      DAG.getTargetConstant(PermMask, DL, MVT::i8));
17701 }
17702 
17703 /// Handle lowering of 8-lane 64-bit floating point shuffles.
17704 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17705                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17706                                  const X86Subtarget &Subtarget,
17707                                  SelectionDAG &DAG) {
17708   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17709   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17710   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17711 
17712   if (V2.isUndef()) {
17713     // Use low duplicate instructions for masks that match their pattern.
17714     if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17715       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17716 
17717     if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17718       // Non-half-crossing single input shuffles can be lowered with an
17719       // interleaved permutation.
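            // E.g. the mask <1, 0, 3, 2, 5, 4, 7, 6> yields the immediate 0x55,
            // swapping the two elements within each 128-bit lane.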
17720       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17721                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17722                               ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17723                               ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17724       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17725                          DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17726     }
17727 
17728     SmallVector<int, 4> RepeatedMask;
17729     if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17730       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17731                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17732   }
17733 
17734   if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17735                                            V2, Subtarget, DAG))
17736     return Shuf128;
17737 
17738   if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
17739     return Unpck;
17740 
17741   // Check if the blend happens to exactly fit that of SHUFPD.
17742   if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17743                                           Zeroable, Subtarget, DAG))
17744     return Op;
17745 
17746   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
17747                                        DAG, Subtarget))
17748     return V;
17749 
17750   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17751                                           Zeroable, Subtarget, DAG))
17752     return Blend;
17753 
17754   return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17755 }
17756 
17757 /// Handle lowering of 16-lane 32-bit floating point shuffles.
17758 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17759                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17760                                   const X86Subtarget &Subtarget,
17761                                   SelectionDAG &DAG) {
17762   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17763   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17764   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17765 
17766   // If the shuffle mask is repeated in each 128-bit lane, we have many more
17767   // options to efficiently lower the shuffle.
17768   SmallVector<int, 4> RepeatedMask;
17769   if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17770     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17771 
17772     // Use even/odd duplicate instructions for masks that match their pattern.
17773     if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17774       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17775     if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17776       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17777 
17778     if (V2.isUndef())
17779       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17780                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17781 
17782     // Use dedicated unpack instructions for masks that match their pattern.
17783     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
17784       return V;
17785 
17786     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17787                                             Zeroable, Subtarget, DAG))
17788       return Blend;
17789 
17790     // Otherwise, fall back to a SHUFPS sequence.
17791     return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17792   }
17793 
17794   // Try to create an in-lane repeating shuffle mask and then shuffle the
17795   // results into the target lanes.
17796   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17797           DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17798     return V;
17799 
17800   // If we have a single-input shuffle with different shuffle patterns in the
17801   // 128-bit lanes and no lane crossing, use a variable-mask VPERMILPS.
17802   if (V2.isUndef() &&
17803       !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17804     SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17805     return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17806   }
17807 
17808   // If we have AVX512F support, we can use VEXPAND.
17809   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, V1, V2,
17810                                        DAG, Subtarget))
17811     return V;
17812 
17813   return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17814 }
17815 
17816 /// Handle lowering of 8-lane 64-bit integer shuffles.
17817 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17818                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17819                                  const X86Subtarget &Subtarget,
17820                                  SelectionDAG &DAG) {
17821   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17822   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17823   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17824 
17825   if (V2.isUndef()) {
17826     // When the shuffle pattern repeats in each 128-bit lane of the vector, we
17827     // can use lower-latency instructions that operate on all four
17828     // 128-bit lanes at once.
17829     SmallVector<int, 2> Repeated128Mask;
17830     if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17831       SmallVector<int, 4> PSHUFDMask;
17832       narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17833       return DAG.getBitcast(
17834           MVT::v8i64,
17835           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17836                       DAG.getBitcast(MVT::v16i32, V1),
17837                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17838     }
17839 
17840     SmallVector<int, 4> Repeated256Mask;
17841     if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17842       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17843                          getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17844   }
17845 
17846   if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17847                                            V2, Subtarget, DAG))
17848     return Shuf128;
17849 
17850   // Try to use shift instructions.
17851   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
17852                                           Zeroable, Subtarget, DAG))
17853     return Shift;
17854 
17855   // Try to use VALIGN.
17856   if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17857                                             Subtarget, DAG))
17858     return Rotate;
17859 
17860   // Try to use PALIGNR.
17861   if (Subtarget.hasBWI())
17862     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17863                                                   Subtarget, DAG))
17864       return Rotate;
17865 
17866   if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
17867     return Unpck;
17868 
17869   // If we have AVX512F support, we can use VEXPAND.
17870   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
17871                                        DAG, Subtarget))
17872     return V;
17873 
17874   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17875                                           Zeroable, Subtarget, DAG))
17876     return Blend;
17877 
17878   return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17879 }
17880 
17881 /// Handle lowering of 16-lane 32-bit integer shuffles.
17882 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17883                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17884                                   const X86Subtarget &Subtarget,
17885                                   SelectionDAG &DAG) {
17886   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17887   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17888   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17889 
17890   // Whenever we can lower this as a zext, that instruction is strictly faster
17891   // than any alternative. It also allows us to fold memory operands into the
17892   // shuffle in many cases.
17893   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17894           DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17895     return ZExt;
17896 
17897   // If the shuffle mask is repeated in each 128-bit lane we can use more
17898   // efficient instructions that mirror the shuffles across the four 128-bit
17899   // lanes.
17900   SmallVector<int, 4> RepeatedMask;
17901   bool Is128BitLaneRepeatedShuffle =
17902       is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17903   if (Is128BitLaneRepeatedShuffle) {
17904     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17905     if (V2.isUndef())
17906       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17907                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17908 
17909     // Use dedicated unpack instructions for masks that match their pattern.
17910     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
17911       return V;
17912   }
17913 
17914   // Try to use shift instructions.
17915   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
17916                                           Zeroable, Subtarget, DAG))
17917     return Shift;
17918 
17919   // Try to use VALIGN.
17920   if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17921                                             Subtarget, DAG))
17922     return Rotate;
17923 
17924   // Try to use byte rotation instructions.
17925   if (Subtarget.hasBWI())
17926     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17927                                                   Subtarget, DAG))
17928       return Rotate;
17929 
17930   // Assume that a single SHUFPS is faster than using a permv shuffle.
17931   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17932   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17933     SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17934     SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17935     SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17936                                             CastV1, CastV2, DAG);
17937     return DAG.getBitcast(MVT::v16i32, ShufPS);
17938   }
17939 
17940   // Try to create an in-lane repeating shuffle mask and then shuffle the
17941   // results into the target lanes.
17942   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17943           DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17944     return V;
17945 
17946   // If we have AVX512F support, we can use VEXPAND.
17947   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
17948                                        DAG, Subtarget))
17949     return V;
17950 
17951   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17952                                           Zeroable, Subtarget, DAG))
17953     return Blend;
17954 
17955   return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17956 }
17957 
17958 /// Handle lowering of 32-lane 16-bit integer shuffles.
17959 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17960                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17961                                   const X86Subtarget &Subtarget,
17962                                   SelectionDAG &DAG) {
17963   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17964   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17965   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17966   assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17967 
17968   // Whenever we can lower this as a zext, that instruction is strictly faster
17969   // than any alternative. It also allows us to fold memory operands into the
17970   // shuffle in many cases.
17971   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17972           DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17973     return ZExt;
17974 
17975   // Use dedicated unpack instructions for masks that match their pattern.
17976   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
17977     return V;
17978 
17979   // Use dedicated pack instructions for masks that match their pattern.
17980   if (SDValue V =
17981           lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
17982     return V;
17983 
17984   // Try to use shift instructions.
17985   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
17986                                           Zeroable, Subtarget, DAG))
17987     return Shift;
17988 
17989   // Try to use byte rotation instructions.
17990   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17991                                                 Subtarget, DAG))
17992     return Rotate;
17993 
17994   if (V2.isUndef()) {
17995     // Try to use bit rotation instructions.
17996     if (SDValue Rotate =
17997             lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17998       return Rotate;
17999 
18000     SmallVector<int, 8> RepeatedMask;
18001     if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
18002       // As this is a single-input shuffle, the repeated mask should be
18003       // a strictly valid v8i16 mask that we can pass through to the v8i16
18004       // lowering to handle even the v32 case.
18005       return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
18006                                                  RepeatedMask, Subtarget, DAG);
18007     }
18008   }
18009 
18010   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
18011                                           Zeroable, Subtarget, DAG))
18012     return Blend;
18013 
18014   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
18015                                               Zeroable, Subtarget, DAG))
18016     return PSHUFB;
18017 
18018   return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
18019 }
18020 
18021 /// Handle lowering of 64-lane 8-bit integer shuffles.
18022 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18023                                  const APInt &Zeroable, SDValue V1, SDValue V2,
18024                                  const X86Subtarget &Subtarget,
18025                                  SelectionDAG &DAG) {
18026   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18027   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18028   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
18029   assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
18030 
18031   // Whenever we can lower this as a zext, that instruction is strictly faster
18032   // than any alternative. It also allows us to fold memory operands into the
18033   // shuffle in many cases.
18034   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18035           DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
18036     return ZExt;
18037 
18038   // Use dedicated unpack instructions for masks that match their pattern.
18039   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
18040     return V;
18041 
18042   // Use dedicated pack instructions for masks that match their pattern.
18043   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
18044                                        Subtarget))
18045     return V;
18046 
18047   // Try to use shift instructions.
18048   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
18049                                           Zeroable, Subtarget, DAG))
18050     return Shift;
18051 
18052   // Try to use byte rotation instructions.
18053   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
18054                                                 Subtarget, DAG))
18055     return Rotate;
18056 
18057   // Try to use bit rotation instructions.
18058   if (V2.isUndef())
18059     if (SDValue Rotate =
18060             lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18061       return Rotate;
18062 
18063   // Lower as AND if possible.
18064   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
18065                                              Zeroable, Subtarget, DAG))
18066     return Masked;
18067 
18068   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
18069                                               Zeroable, Subtarget, DAG))
18070     return PSHUFB;
18071 
18072   // VBMI can use VPERMV/VPERMV3 byte shuffles.
18073   if (Subtarget.hasVBMI())
18074     return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18075 
18076   // Try to create an in-lane repeating shuffle mask and then shuffle the
18077   // results into the target lanes.
18078   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18079           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18080     return V;
18081 
18082   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
18083                                           Zeroable, Subtarget, DAG))
18084     return Blend;
18085 
18086   // Try to simplify this by merging 128-bit lanes to enable a lane-based
18087   // shuffle.
18088   if (!V2.isUndef())
18089     if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18090             DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18091       return Result;
18092 
18093   // FIXME: Implement direct support for this type!
18094   return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
18095 }
18096 
18097 /// High-level routine to lower various 512-bit x86 vector shuffles.
18098 ///
18099 /// This routine either breaks down the specific type of a 512-bit x86 vector
18100 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
18101 /// together based on the available instructions.
18102 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18103                                   MVT VT, SDValue V1, SDValue V2,
18104                                   const APInt &Zeroable,
18105                                   const X86Subtarget &Subtarget,
18106                                   SelectionDAG &DAG) {
18107   assert(Subtarget.hasAVX512() &&
18108          "Cannot lower 512-bit vectors w/ basic ISA!");
18109 
18110   // If we have a single input to the zero element, insert that into V1 if we
18111   // can do so cheaply.
18112   int NumElts = Mask.size();
18113   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18114 
18115   if (NumV2Elements == 1 && Mask[0] >= NumElts)
18116     if (SDValue Insertion = lowerShuffleAsElementInsertion(
18117             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18118       return Insertion;
18119 
18120   // Handle special cases where the lower or upper half is UNDEF.
18121   if (SDValue V =
18122           lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18123     return V;
18124 
18125   // Check for being able to broadcast a single element.
18126   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18127                                                   Subtarget, DAG))
18128     return Broadcast;
18129 
18130   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18131     // Try using bit ops for masking and blending before falling back to
18132     // splitting.
18133     if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18134                                           Subtarget, DAG))
18135       return V;
18136     if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18137       return V;
18138 
18139     return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18140   }
18141 
18142   // Dispatch to each element type for lowering. If we don't have support for
18143   // specific element type shuffles at 512 bits, immediately split them and
18144   // lower them. Each lowering routine of a given type is allowed to assume that
18145   // the requisite ISA extensions for that element type are available.
18146   switch (VT.SimpleTy) {
18147   case MVT::v8f64:
18148     return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18149   case MVT::v16f32:
18150     return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18151   case MVT::v8i64:
18152     return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18153   case MVT::v16i32:
18154     return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18155   case MVT::v32i16:
18156     return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18157   case MVT::v64i8:
18158     return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18159 
18160   default:
18161     llvm_unreachable("Not a valid 512-bit x86 vector type!");
18162   }
18163 }
18164 
18165 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
18166                                          MVT VT, SDValue V1, SDValue V2,
18167                                          const X86Subtarget &Subtarget,
18168                                          SelectionDAG &DAG) {
18169   // Shuffle should be unary.
18170   if (!V2.isUndef())
18171     return SDValue();
18172 
18173   int ShiftAmt = -1;
18174   int NumElts = Mask.size();
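        // E.g. an 8-element mask <3, 4, 5, 6, 7, U, U, U> is a right shift by 3.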
18175   for (int i = 0; i != NumElts; ++i) {
18176     int M = Mask[i];
18177     assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
18178            "Unexpected mask index.");
18179     if (M < 0)
18180       continue;
18181 
18182     // The first non-undef element determines our shift amount.
18183     if (ShiftAmt < 0) {
18184       ShiftAmt = M - i;
18185       // Need to be shifting right.
18186       if (ShiftAmt <= 0)
18187         return SDValue();
18188     }
18189     // All non-undef elements must shift by the same amount.
18190     if (ShiftAmt != M - i)
18191       return SDValue();
18192   }
18193   assert(ShiftAmt >= 0 && "All undef?");
18194 
18195   // Great, we found a shift right.
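        // The narrowest legal KSHIFT operates on v16i1 (or v8i1 with DQI), so widen
        // any narrower mask type before shifting.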
18196   MVT WideVT = VT;
18197   if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18198     WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18199   SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18200                             DAG.getUNDEF(WideVT), V1,
18201                             DAG.getIntPtrConstant(0, DL));
18202   Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
18203                     DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18204   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18205                      DAG.getIntPtrConstant(0, DL));
18206 }
18207 
18208 // Determine if this shuffle can be implemented with a KSHIFT instruction.
18209 // Returns the shift amount if possible or -1 if not. This is a simplified
18210 // version of matchShuffleAsShift.
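      // E.g. with MaskOffset 0, an 8-element mask <2, 3, 4, 5, 6, 7, Z, Z> whose
      // top two elements are zeroable returns 2 with Opcode set to X86ISD::KSHIFTR.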
18211 static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18212                                     int MaskOffset, const APInt &Zeroable) {
18213   int Size = Mask.size();
18214 
18215   auto CheckZeros = [&](int Shift, bool Left) {
18216     for (int j = 0; j < Shift; ++j)
18217       if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18218         return false;
18219 
18220     return true;
18221   };
18222 
18223   auto MatchShift = [&](int Shift, bool Left) {
18224     unsigned Pos = Left ? Shift : 0;
18225     unsigned Low = Left ? 0 : Shift;
18226     unsigned Len = Size - Shift;
18227     return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18228   };
18229 
18230   for (int Shift = 1; Shift != Size; ++Shift)
18231     for (bool Left : {true, false})
18232       if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18233         Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18234         return Shift;
18235       }
18236 
18237   return -1;
18238 }
18239 
18240 
18241 // Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle that, and then truncate it back.
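// For example, a v8i1 shuffle can be lowered by sign-extending both inputs to
// v8i64 (or v8i32 with VLX), shuffling the wide vectors, and then truncating
// (or sign-bit testing) the result back to v8i1.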
18245 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18246                                 MVT VT, SDValue V1, SDValue V2,
18247                                 const APInt &Zeroable,
18248                                 const X86Subtarget &Subtarget,
18249                                 SelectionDAG &DAG) {
18250   assert(Subtarget.hasAVX512() &&
18251          "Cannot lower 512-bit vectors w/o basic ISA!");
18252 
18253   int NumElts = Mask.size();
18254 
18255   // Try to recognize shuffles that are just padding a subvector with zeros.
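  // For example, if the low four elements of a v8i1 shuffle are <0,1,2,3>
  // from V1 and the upper four elements are known zero, this becomes an
  // extract of the low v4i1 of V1 inserted into a zero vector.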
18256   int SubvecElts = 0;
18257   int Src = -1;
18258   for (int i = 0; i != NumElts; ++i) {
18259     if (Mask[i] >= 0) {
      // Grab the source from the first valid mask element. All subsequent
      // elements must use the same source.
18262       if (Src < 0)
18263         Src = Mask[i] / NumElts;
18264       if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18265         break;
18266     }
18267 
18268     ++SubvecElts;
18269   }
18270   assert(SubvecElts != NumElts && "Identity shuffle?");
18271 
  // Clip to a power of 2.
18273   SubvecElts = PowerOf2Floor(SubvecElts);
18274 
18275   // Make sure the number of zeroable bits in the top at least covers the bits
18276   // not covered by the subvector.
18277   if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
18278     assert(Src >= 0 && "Expected a source!");
18279     MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18280     SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
18281                                   Src == 0 ? V1 : V2,
18282                                   DAG.getIntPtrConstant(0, DL));
18283     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18284                        DAG.getConstant(0, DL, VT),
18285                        Extract, DAG.getIntPtrConstant(0, DL));
18286   }
18287 
18288   // Try a simple shift right with undef elements. Later we'll try with zeros.
18289   if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
18290                                                 DAG))
18291     return Shift;
18292 
18293   // Try to match KSHIFTs.
18294   unsigned Offset = 0;
18295   for (SDValue V : { V1, V2 }) {
18296     unsigned Opcode;
18297     int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18298     if (ShiftAmt >= 0) {
18299       MVT WideVT = VT;
18300       if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18301         WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18302       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18303                                 DAG.getUNDEF(WideVT), V,
18304                                 DAG.getIntPtrConstant(0, DL));
18305       // Widened right shifts need two shifts to ensure we shift in zeroes.
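      // For example, a v4i1 KSHIFTR by 1 widened to v16i1 becomes a KSHIFTL
      // by 12 followed by a KSHIFTR by 13.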
18306       if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18307         int WideElts = WideVT.getVectorNumElements();
18308         // Shift left to put the original vector in the MSBs of the new size.
18309         Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18310                           DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18311         // Increase the shift amount to account for the left shift.
18312         ShiftAmt += WideElts - NumElts;
18313       }
18314 
18315       Res = DAG.getNode(Opcode, DL, WideVT, Res,
18316                         DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18317       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18318                          DAG.getIntPtrConstant(0, DL));
18319     }
18320     Offset += NumElts; // Increment for next iteration.
18321   }
18322 
18325   MVT ExtVT;
18326   switch (VT.SimpleTy) {
18327   default:
18328     llvm_unreachable("Expected a vector of i1 elements");
18329   case MVT::v2i1:
18330     ExtVT = MVT::v2i64;
18331     break;
18332   case MVT::v4i1:
18333     ExtVT = MVT::v4i32;
18334     break;
18335   case MVT::v8i1:
    // Take a 512-bit type; more shuffles are available on KNL. If we have
    // VLX, use a 256-bit shuffle instead.
18338     ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18339     break;
18340   case MVT::v16i1:
18341     // Take 512-bit type, unless we are avoiding 512-bit types and have the
18342     // 256-bit operation available.
18343     ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18344     break;
18345   case MVT::v32i1:
18346     // Take 512-bit type, unless we are avoiding 512-bit types and have the
18347     // 256-bit operation available.
18348     assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18349     ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18350     break;
18351   case MVT::v64i1:
18352     // Fall back to scalarization. FIXME: We can do better if the shuffle
18353     // can be partitioned cleanly.
18354     if (!Subtarget.useBWIRegs())
18355       return SDValue();
18356     ExtVT = MVT::v64i8;
18357     break;
18358   }
18359 
18360   V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18361   V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18362 
18363   SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
  // The i1 elements were sign-extended, so the mask can be rebuilt by testing
  // the sign bit of the shuffled result.
18365   int NumElems = VT.getVectorNumElements();
18366   if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18367       (Subtarget.hasDQI() && (NumElems < 32)))
18368     return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18369                        Shuffle, ISD::SETGT);
18370 
18371   return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18372 }
18373 
18374 /// Helper function that returns true if the shuffle mask should be
18375 /// commuted to improve canonicalization.
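///
/// For example, a v4i32 mask <4,5,6,3> takes three elements from V2 and only
/// one from V1, so commuting it to <0,1,2,7> (and swapping V1 and V2) is
/// preferred.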
18376 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18377   int NumElements = Mask.size();
18378 
18379   int NumV1Elements = 0, NumV2Elements = 0;
18380   for (int M : Mask)
18381     if (M < 0)
18382       continue;
18383     else if (M < NumElements)
18384       ++NumV1Elements;
18385     else
18386       ++NumV2Elements;
18387 
18388   // Commute the shuffle as needed such that more elements come from V1 than
18389   // V2. This allows us to match the shuffle pattern strictly on how many
18390   // elements come from V1 without handling the symmetric cases.
18391   if (NumV2Elements > NumV1Elements)
18392     return true;
18393 
18394   assert(NumV1Elements > 0 && "No V1 indices");
18395 
18396   if (NumV2Elements == 0)
18397     return false;
18398 
  // When the number of V1 and V2 elements is the same, try to minimize the
  // number of uses of V2 in the low half of the vector. When that is tied,
  // ensure that the sum of indices for V1 is equal to or lower than the sum
  // of indices for V2. When those are equal, try to ensure that the number
  // of odd indices for V1 is lower than the number of odd indices for V2.
18404   if (NumV1Elements == NumV2Elements) {
18405     int LowV1Elements = 0, LowV2Elements = 0;
18406     for (int M : Mask.slice(0, NumElements / 2))
18407       if (M >= NumElements)
18408         ++LowV2Elements;
18409       else if (M >= 0)
18410         ++LowV1Elements;
18411     if (LowV2Elements > LowV1Elements)
18412       return true;
18413     if (LowV2Elements == LowV1Elements) {
18414       int SumV1Indices = 0, SumV2Indices = 0;
18415       for (int i = 0, Size = Mask.size(); i < Size; ++i)
18416         if (Mask[i] >= NumElements)
18417           SumV2Indices += i;
18418         else if (Mask[i] >= 0)
18419           SumV1Indices += i;
18420       if (SumV2Indices < SumV1Indices)
18421         return true;
18422       if (SumV2Indices == SumV1Indices) {
18423         int NumV1OddIndices = 0, NumV2OddIndices = 0;
18424         for (int i = 0, Size = Mask.size(); i < Size; ++i)
18425           if (Mask[i] >= NumElements)
18426             NumV2OddIndices += i % 2;
18427           else if (Mask[i] >= 0)
18428             NumV1OddIndices += i % 2;
18429         if (NumV2OddIndices < NumV1OddIndices)
18430           return true;
18431       }
18432     }
18433   }
18434 
18435   return false;
18436 }
18437 
18438 /// Top-level lowering for x86 vector shuffles.
18439 ///
18440 /// This handles decomposition, canonicalization, and lowering of all x86
18441 /// vector shuffles. Most of the specific lowering strategies are encapsulated
18442 /// above in helper routines. The canonicalization attempts to widen shuffles
18443 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
18444 /// s.t. only one of the two inputs needs to be tested, etc.
18445 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18446                                    SelectionDAG &DAG) {
18447   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18448   ArrayRef<int> OrigMask = SVOp->getMask();
18449   SDValue V1 = Op.getOperand(0);
18450   SDValue V2 = Op.getOperand(1);
18451   MVT VT = Op.getSimpleValueType();
18452   int NumElements = VT.getVectorNumElements();
18453   SDLoc DL(Op);
18454   bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18455 
18456   assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18457          "Can't lower MMX shuffles");
18458 
18459   bool V1IsUndef = V1.isUndef();
18460   bool V2IsUndef = V2.isUndef();
18461   if (V1IsUndef && V2IsUndef)
18462     return DAG.getUNDEF(VT);
18463 
  // When we create a shuffle node, the UNDEF node goes in the second operand,
  // but in some cases the first operand may be transformed to UNDEF.
  // In this case we should just commute the node.
18467   if (V1IsUndef)
18468     return DAG.getCommutedVectorShuffle(*SVOp);
18469 
18470   // Check for non-undef masks pointing at an undef vector and make the masks
18471   // undef as well. This makes it easier to match the shuffle based solely on
18472   // the mask.
18473   if (V2IsUndef &&
18474       any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18475     SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
18476     for (int &M : NewMask)
18477       if (M >= NumElements)
18478         M = -1;
18479     return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18480   }
18481 
18482   // Check for illegal shuffle mask element index values.
18483   int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18484   (void)MaskUpperLimit;
18485   assert(llvm::all_of(OrigMask,
18486                       [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18487          "Out of bounds shuffle index");
18488 
18489   // We actually see shuffles that are entirely re-arrangements of a set of
18490   // zero inputs. This mostly happens while decomposing complex shuffles into
18491   // simple ones. Directly lower these as a buildvector of zeros.
18492   APInt KnownUndef, KnownZero;
18493   computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18494 
18495   APInt Zeroable = KnownUndef | KnownZero;
18496   if (Zeroable.isAllOnesValue())
18497     return getZeroVector(VT, Subtarget, DAG, DL);
18498 
18499   bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18500 
18501   // Try to collapse shuffles into using a vector type with fewer elements but
18502   // wider element types. We cap this to not form integers or floating point
18503   // elements wider than 64 bits. It does not seem beneficial to form i128
18504   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
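  // For example, a v4i32 shuffle with mask <0,1,4,5> widens to a v2i64
  // shuffle with mask <0,2>.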
18505   SmallVector<int, 16> WidenedMask;
18506   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18507       canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18508     // Shuffle mask widening should not interfere with a broadcast opportunity
18509     // by obfuscating the operands with bitcasts.
18510     // TODO: Avoid lowering directly from this top-level function: make this
18511     // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18512     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18513                                                     Subtarget, DAG))
18514       return Broadcast;
18515 
18516     MVT NewEltVT = VT.isFloatingPoint()
18517                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18518                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18519     int NewNumElts = NumElements / 2;
18520     MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18521     // Make sure that the new vector type is legal. For example, v2f64 isn't
18522     // legal on SSE1.
18523     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18524       if (V2IsZero) {
18525         // Modify the new Mask to take all zeros from the all-zero vector.
18526         // Choose indices that are blend-friendly.
18527         bool UsedZeroVector = false;
18528         assert(is_contained(WidenedMask, SM_SentinelZero) &&
18529                "V2's non-undef elements are used?!");
18530         for (int i = 0; i != NewNumElts; ++i)
18531           if (WidenedMask[i] == SM_SentinelZero) {
18532             WidenedMask[i] = i + NewNumElts;
18533             UsedZeroVector = true;
18534           }
18535         // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18536         // some elements to be undef.
18537         if (UsedZeroVector)
18538           V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18539       }
18540       V1 = DAG.getBitcast(NewVT, V1);
18541       V2 = DAG.getBitcast(NewVT, V2);
18542       return DAG.getBitcast(
18543           VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18544     }
18545   }
18546 
18547   // Commute the shuffle if it will improve canonicalization.
18548   SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
18549   if (canonicalizeShuffleMaskWithCommute(Mask)) {
18550     ShuffleVectorSDNode::commuteMask(Mask);
18551     std::swap(V1, V2);
18552   }
18553 
18554   // For each vector width, delegate to a specialized lowering routine.
18555   if (VT.is128BitVector())
18556     return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18557 
18558   if (VT.is256BitVector())
18559     return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18560 
18561   if (VT.is512BitVector())
18562     return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18563 
18564   if (Is1BitVector)
18565     return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18566 
18567   llvm_unreachable("Unimplemented!");
18568 }
18569 
18570 /// Try to lower a VSELECT instruction to a vector shuffle.
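/// For example, a v4i32 vselect with the constant condition <-1,0,0,-1>
/// becomes the shuffle <0,5,6,3> of LHS and RHS.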
18571 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18572                                            const X86Subtarget &Subtarget,
18573                                            SelectionDAG &DAG) {
18574   SDValue Cond = Op.getOperand(0);
18575   SDValue LHS = Op.getOperand(1);
18576   SDValue RHS = Op.getOperand(2);
18577   MVT VT = Op.getSimpleValueType();
18578 
  // Only non-legal VSELECTs reach this lowering; convert those into generic
  // shuffles and reuse the shuffle lowering path for blends.
18581   if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18582     SmallVector<int, 32> Mask;
18583     if (createShuffleMaskFromVSELECT(Mask, Cond))
18584       return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18585   }
18586 
18587   return SDValue();
18588 }
18589 
18590 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18591   SDValue Cond = Op.getOperand(0);
18592   SDValue LHS = Op.getOperand(1);
18593   SDValue RHS = Op.getOperand(2);
18594 
18595   // A vselect where all conditions and data are constants can be optimized into
18596   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18597   if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18598       ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18599       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18600     return SDValue();
18601 
18602   // Try to lower this to a blend-style vector shuffle. This can handle all
18603   // constant condition cases.
18604   if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18605     return BlendOp;
18606 
  // If this VSELECT has a vector of i1 as its mask, it will be directly
  // matched with patterns on the mask registers on AVX-512.
18609   MVT CondVT = Cond.getSimpleValueType();
18610   unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18611   if (CondEltSize == 1)
18612     return Op;
18613 
18614   // Variable blends are only legal from SSE4.1 onward.
18615   if (!Subtarget.hasSSE41())
18616     return SDValue();
18617 
18618   SDLoc dl(Op);
18619   MVT VT = Op.getSimpleValueType();
18620   unsigned EltSize = VT.getScalarSizeInBits();
18621   unsigned NumElts = VT.getVectorNumElements();
18622 
18623   // Expand v32i16/v64i8 without BWI.
18624   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18625     return SDValue();
18626 
18627   // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18628   // into an i1 condition so that we can use the mask-based 512-bit blend
18629   // instructions.
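  // For example, a v16i32 vselect with a v16i32 condition becomes a vselect
  // on the v16i1 result of (setne Cond, 0).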
18630   if (VT.getSizeInBits() == 512) {
18631     // Build a mask by testing the condition against zero.
18632     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18633     SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18634                                 DAG.getConstant(0, dl, CondVT),
18635                                 ISD::SETNE);
18636     // Now return a new VSELECT using the mask.
18637     return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18638   }
18639 
18640   // SEXT/TRUNC cases where the mask doesn't match the destination size.
18641   if (CondEltSize != EltSize) {
18642     // If we don't have a sign splat, rely on the expansion.
18643     if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18644       return SDValue();
18645 
18646     MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18647     MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18648     Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18649     return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18650   }
18651 
  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op; if we need to expand, return a null
  // value.
18655   switch (VT.SimpleTy) {
18656   default:
18657     // Most of the vector types have blends past SSE4.1.
18658     return Op;
18659 
18660   case MVT::v32i8:
18661     // The byte blends for AVX vectors were introduced only in AVX2.
18662     if (Subtarget.hasAVX2())
18663       return Op;
18664 
18665     return SDValue();
18666 
18667   case MVT::v8i16:
18668   case MVT::v16i16: {
18669     // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18670     MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18671     Cond = DAG.getBitcast(CastVT, Cond);
18672     LHS = DAG.getBitcast(CastVT, LHS);
18673     RHS = DAG.getBitcast(CastVT, RHS);
18674     SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18675     return DAG.getBitcast(VT, Select);
18676   }
18677   }
18678 }
18679 
18680 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18681   MVT VT = Op.getSimpleValueType();
18682   SDValue Vec = Op.getOperand(0);
18683   SDValue Idx = Op.getOperand(1);
18684   assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18685   SDLoc dl(Op);
18686 
18687   if (!Vec.getSimpleValueType().is128BitVector())
18688     return SDValue();
18689 
18690   if (VT.getSizeInBits() == 8) {
18691     // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18692     // we're going to zero extend the register or fold the store.
18693     if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
18694         !MayFoldIntoStore(Op))
18695       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18696                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18697                                      DAG.getBitcast(MVT::v4i32, Vec), Idx));
18698 
18699     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
18700     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18701                                   DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18702     return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18703   }
18704 
18705   if (VT == MVT::f32) {
18706     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18707     // the result back to FR32 register. It's only worth matching if the
18708     // result has a single use which is a store or a bitcast to i32.  And in
18709     // the case of a store, it's not worth it if the index is a constant 0,
18710     // because a MOVSSmr can be used instead, which is smaller and faster.
18711     if (!Op.hasOneUse())
18712       return SDValue();
18713     SDNode *User = *Op.getNode()->use_begin();
18714     if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18715         (User->getOpcode() != ISD::BITCAST ||
18716          User->getValueType(0) != MVT::i32))
18717       return SDValue();
18718     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18719                                   DAG.getBitcast(MVT::v4i32, Vec), Idx);
18720     return DAG.getBitcast(MVT::f32, Extract);
18721   }
18722 
18723   if (VT == MVT::i32 || VT == MVT::i64)
18724       return Op;
18725 
18726   return SDValue();
18727 }
18728 
/// Extract one bit from a mask vector, like v16i1 or v8i1.
/// This is an AVX-512 feature.
18731 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18732                                         const X86Subtarget &Subtarget) {
18733   SDValue Vec = Op.getOperand(0);
18734   SDLoc dl(Vec);
18735   MVT VecVT = Vec.getSimpleValueType();
18736   SDValue Idx = Op.getOperand(1);
18737   auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18738   MVT EltVT = Op.getSimpleValueType();
18739 
18740   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18741          "Unexpected vector type in ExtractBitFromMaskVector");
18742 
  // A variable index can't be handled in mask registers; extend the vector to
  // VR512/VR128 instead.
18745   if (!IdxC) {
18746     unsigned NumElts = VecVT.getVectorNumElements();
    // Extending v8i1/v16i1 to 512 bits gets better performance on KNL
    // than extending to 128/256 bits.
18749     MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18750     MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18751     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18752     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18753     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18754   }
18755 
18756   unsigned IdxVal = IdxC->getZExtValue();
18757   if (IdxVal == 0) // the operation is legal
18758     return Op;
18759 
18760   // Extend to natively supported kshift.
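  // For example, extracting element 5 of a v8i1 without DQI widens the
  // vector to v16i1, shifts right by 5 with KSHIFTR, and extracts element 0.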
18761   unsigned NumElems = VecVT.getVectorNumElements();
18762   MVT WideVecVT = VecVT;
18763   if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
18764     WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18765     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
18766                       DAG.getUNDEF(WideVecVT), Vec,
18767                       DAG.getIntPtrConstant(0, dl));
18768   }
18769 
18770   // Use kshiftr instruction to move to the lower element.
18771   Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
18772                     DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18773 
18774   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18775                      DAG.getIntPtrConstant(0, dl));
18776 }
18777 
18778 SDValue
18779 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18780                                            SelectionDAG &DAG) const {
18781   SDLoc dl(Op);
18782   SDValue Vec = Op.getOperand(0);
18783   MVT VecVT = Vec.getSimpleValueType();
18784   SDValue Idx = Op.getOperand(1);
18785   auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18786 
18787   if (VecVT.getVectorElementType() == MVT::i1)
18788     return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18789 
18790   if (!IdxC) {
    // It's more profitable to go through memory (1 cycle throughput)
    // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
    // The IACA tool was used to get the performance estimates
    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18795     //
18796     // example : extractelement <16 x i8> %a, i32 %i
18797     //
18798     // Block Throughput: 3.00 Cycles
18799     // Throughput Bottleneck: Port5
18800     //
18801     // | Num Of |   Ports pressure in cycles  |    |
18802     // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
18803     // ---------------------------------------------
18804     // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
18805     // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
18806     // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
18807     // Total Num Of Uops: 4
18808     //
18809     //
18810     // Block Throughput: 1.00 Cycles
18811     // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18812     //
18813     // |    |  Ports pressure in cycles   |  |
18814     // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
18815     // ---------------------------------------------------------
18816     // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18817     // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
18818     // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
18819     // Total Num Of Uops: 4
18820 
18821     return SDValue();
18822   }
18823 
18824   unsigned IdxVal = IdxC->getZExtValue();
18825 
18826   // If this is a 256-bit vector result, first extract the 128-bit vector and
18827   // then extract the element from the 128-bit vector.
18828   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18829     // Get the 128-bit vector.
18830     Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18831     MVT EltVT = VecVT.getVectorElementType();
18832 
18833     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18834     assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18835 
18836     // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18837     // this can be done with a mask.
18838     IdxVal &= ElemsPerChunk - 1;
18839     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18840                        DAG.getIntPtrConstant(IdxVal, dl));
18841   }
18842 
18843   assert(VecVT.is128BitVector() && "Unexpected vector length");
18844 
18845   MVT VT = Op.getSimpleValueType();
18846 
18847   if (VT.getSizeInBits() == 16) {
18848     // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18849     // we're going to zero extend the register or fold the store (SSE41 only).
18850     if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
18851         !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
18852       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18853                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18854                                      DAG.getBitcast(MVT::v4i32, Vec), Idx));
18855 
18856     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18857                                   DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18858     return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18859   }
18860 
18861   if (Subtarget.hasSSE41())
18862     if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18863       return Res;
18864 
  // TODO: We only extract a single element from v16i8; we can probably afford
  // to be more aggressive here before using the default approach of spilling
  // to the stack.
18868   if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
18869     // Extract either the lowest i32 or any i16, and extract the sub-byte.
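    // For example, extracting byte 5 extracts word 2 of a v8i16 bitcast,
    // shifts it right by 8, and truncates to i8.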
18870     int DWordIdx = IdxVal / 4;
18871     if (DWordIdx == 0) {
18872       SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18873                                 DAG.getBitcast(MVT::v4i32, Vec),
18874                                 DAG.getIntPtrConstant(DWordIdx, dl));
18875       int ShiftVal = (IdxVal % 4) * 8;
18876       if (ShiftVal != 0)
18877         Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18878                           DAG.getConstant(ShiftVal, dl, MVT::i8));
18879       return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18880     }
18881 
18882     int WordIdx = IdxVal / 2;
18883     SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18884                               DAG.getBitcast(MVT::v8i16, Vec),
18885                               DAG.getIntPtrConstant(WordIdx, dl));
18886     int ShiftVal = (IdxVal % 2) * 8;
18887     if (ShiftVal != 0)
18888       Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18889                         DAG.getConstant(ShiftVal, dl, MVT::i8));
18890     return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18891   }
18892 
18893   if (VT.getSizeInBits() == 32) {
18894     if (IdxVal == 0)
18895       return Op;
18896 
18897     // SHUFPS the element to the lowest double word, then movss.
18898     int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
18899     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18900     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18901                        DAG.getIntPtrConstant(0, dl));
18902   }
18903 
18904   if (VT.getSizeInBits() == 64) {
18905     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18906     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18907     //        to match extract_elt for f64.
18908     if (IdxVal == 0)
18909       return Op;
18910 
18911     // UNPCKHPD the element to the lowest double word, then movsd.
18912     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18913     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18914     int Mask[2] = { 1, -1 };
18915     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18916     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18917                        DAG.getIntPtrConstant(0, dl));
18918   }
18919 
18920   return SDValue();
18921 }
18922 
/// Insert one bit into a mask vector, like v16i1 or v8i1.
/// This is an AVX-512 feature.
18925 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18926                                      const X86Subtarget &Subtarget) {
18927   SDLoc dl(Op);
18928   SDValue Vec = Op.getOperand(0);
18929   SDValue Elt = Op.getOperand(1);
18930   SDValue Idx = Op.getOperand(2);
18931   MVT VecVT = Vec.getSimpleValueType();
18932 
18933   if (!isa<ConstantSDNode>(Idx)) {
    // Non-constant index. Extend the source and destination, insert the
    // element, and then truncate the result.
18936     unsigned NumElts = VecVT.getVectorNumElements();
18937     MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18938     MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18939     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18940       DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18941       DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18942     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18943   }
18944 
18945   // Copy into a k-register, extract to v1i1 and insert_subvector.
18946   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18947   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18948 }
18949 
18950 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18951                                                   SelectionDAG &DAG) const {
18952   MVT VT = Op.getSimpleValueType();
18953   MVT EltVT = VT.getVectorElementType();
18954   unsigned NumElts = VT.getVectorNumElements();
18955   unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18956 
18957   if (EltVT == MVT::i1)
18958     return InsertBitToMaskVector(Op, DAG, Subtarget);
18959 
18960   SDLoc dl(Op);
18961   SDValue N0 = Op.getOperand(0);
18962   SDValue N1 = Op.getOperand(1);
18963   SDValue N2 = Op.getOperand(2);
18964   auto *N2C = dyn_cast<ConstantSDNode>(N2);
18965 
18966   if (!N2C) {
    // Variable insertion indices: usually we're better off spilling to the
    // stack, but AVX512 can use a variable compare+select by comparing
    // against all possible vector indices, and FP insertion has less
    // gpr->simd traffic.
18970     if (!(Subtarget.hasBWI() ||
18971           (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18972           (Subtarget.hasSSE41() && VT.isFloatingPoint())))
18973       return SDValue();
18974 
18975     MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18976     MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18977     if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18978       return SDValue();
18979 
18980     SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18981     SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18982     SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18983 
18984     SmallVector<SDValue, 16> RawIndices;
18985     for (unsigned I = 0; I != NumElts; ++I)
18986       RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18987     SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18988 
18989     // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18990     return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18991                            ISD::CondCode::SETEQ);
18992   }
18993 
18994   if (N2C->getAPIntValue().uge(NumElts))
18995     return SDValue();
18996   uint64_t IdxVal = N2C->getZExtValue();
18997 
18998   bool IsZeroElt = X86::isZeroNode(N1);
18999   bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
19000 
  // If we are inserting an all-zeros or all-ones element, see if we can do
  // this more efficiently with a blend shuffle against a rematerializable
  // vector than with a costly integer insertion.
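  // For example, inserting zero into element 2 of a v4i32 becomes a shuffle
  // of N0 with a zero vector using mask <0,1,6,3>.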
19004   if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
19005       (16 <= EltSizeInBits || (IsZeroElt && !VT.is128BitVector()))) {
19006     SmallVector<int, 8> BlendMask;
19007     for (unsigned i = 0; i != NumElts; ++i)
19008       BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19009     SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19010                                   : getOnesVector(VT, DAG, dl);
19011     return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19012   }
19013 
19014   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19015   // into that, and then insert the subvector back into the result.
19016   if (VT.is256BitVector() || VT.is512BitVector()) {
19017     // With a 256-bit vector, we can insert into the zero element efficiently
19018     // using a blend if we have AVX or AVX2 and the right data type.
19019     if (VT.is256BitVector() && IdxVal == 0) {
19020       // TODO: It is worthwhile to cast integer to floating point and back
19021       // and incur a domain crossing penalty if that's what we'll end up
19022       // doing anyway after extracting to a 128-bit vector.
19023       if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19024           (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
19025         SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19026         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19027                            DAG.getTargetConstant(1, dl, MVT::i8));
19028       }
19029     }
19030 
19031     // Get the desired 128-bit vector chunk.
19032     SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19033 
19034     // Insert the element into the desired chunk.
19035     unsigned NumEltsIn128 = 128 / EltSizeInBits;
19036     assert(isPowerOf2_32(NumEltsIn128));
    // Since NumEltsIn128 is a power of 2 we can use a mask instead of modulo.
19038     unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19039 
19040     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19041                     DAG.getIntPtrConstant(IdxIn128, dl));
19042 
19043     // Insert the changed part back into the bigger vector
19044     return insert128BitVector(N0, V, IdxVal, DAG, dl);
19045   }
19046   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19047 
19048   // This will be just movd/movq/movss/movsd.
19049   if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19050     if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19051         EltVT == MVT::i64) {
19052       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19053       return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19054     }
19055 
19056     // We can't directly insert an i8 or i16 into a vector, so zero extend
19057     // it to i32 first.
19058     if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19059       N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19060       MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19061       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19062       N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19063       return DAG.getBitcast(VT, N1);
19064     }
19065   }
19066 
  // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
  // argument. SSE41 is required for pinsrb.
19069   if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19070     unsigned Opc;
19071     if (VT == MVT::v8i16) {
19072       assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19073       Opc = X86ISD::PINSRW;
19074     } else {
19075       assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19076       assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19077       Opc = X86ISD::PINSRB;
19078     }
19079 
19080     assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19081     N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19082     N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19083     return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19084   }
19085 
19086   if (Subtarget.hasSSE41()) {
19087     if (EltVT == MVT::f32) {
19088       // Bits [7:6] of the constant are the source select. This will always be
19089       //   zero here. The DAG Combiner may combine an extract_elt index into
19090       //   these bits. For example (insert (extract, 3), 2) could be matched by
19091       //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19092       // Bits [5:4] of the constant are the destination select. This is the
19093       //   value of the incoming immediate.
19094       // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19095       //   combine either bitwise AND or insert of float 0.0 to set these bits.
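      // For example, IdxVal == 2 gives an immediate of 0x20: take source
      // element 0 and place it into destination element 2 with no zeroing.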
19096 
19097       bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19098       if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
19099         // If this is an insertion of 32-bits into the low 32-bits of
19100         // a vector, we prefer to generate a blend with immediate rather
19101         // than an insertps. Blends are simpler operations in hardware and so
19102         // will always have equal or better performance than insertps.
19103         // But if optimizing for size and there's a load folding opportunity,
19104         // generate insertps because blendps does not have a 32-bit memory
19105         // operand form.
19106         N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19107         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19108                            DAG.getTargetConstant(1, dl, MVT::i8));
19109       }
      // Create this as a scalar to vector.
19111       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19112       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19113                          DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19114     }
19115 
19116     // PINSR* works with constant index.
19117     if (EltVT == MVT::i32 || EltVT == MVT::i64)
19118       return Op;
19119   }
19120 
19121   return SDValue();
19122 }
19123 
19124 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19125                                      SelectionDAG &DAG) {
19126   SDLoc dl(Op);
19127   MVT OpVT = Op.getSimpleValueType();
19128 
  // It's always cheaper to replace a xor+movd with xorps, and it simplifies
  // further combines.
19131   if (X86::isZeroNode(Op.getOperand(0)))
19132     return getZeroVector(OpVT, Subtarget, DAG, dl);
19133 
19134   // If this is a 256-bit vector result, first insert into a 128-bit
19135   // vector and then insert into the 256-bit vector.
19136   if (!OpVT.is128BitVector()) {
19137     // Insert into a 128-bit vector.
19138     unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19139     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19140                                  OpVT.getVectorNumElements() / SizeFactor);
19141 
19142     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19143 
19144     // Insert the 128-bit vector.
19145     return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19146   }
19147   assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19148          "Expected an SSE type!");
19149 
19150   // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
19151   if (OpVT == MVT::v4i32)
19152     return Op;
19153 
19154   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19155   return DAG.getBitcast(
19156       OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19157 }
19158 
19159 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
19160 // simple superregister reference or explicit instructions to insert
19161 // the upper bits of a vector.
19162 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19163                                      SelectionDAG &DAG) {
19164   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19165 
19166   return insert1BitVector(Op, DAG, Subtarget);
19167 }
19168 
19169 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19170                                       SelectionDAG &DAG) {
19171   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19172          "Only vXi1 extract_subvectors need custom lowering");
19173 
19174   SDLoc dl(Op);
19175   SDValue Vec = Op.getOperand(0);
19176   uint64_t IdxVal = Op.getConstantOperandVal(1);
19177 
19178   if (IdxVal == 0) // the operation is legal
19179     return Op;
19180 
19181   MVT VecVT = Vec.getSimpleValueType();
19182   unsigned NumElems = VecVT.getVectorNumElements();
19183 
19184   // Extend to natively supported kshift.
19185   MVT WideVecVT = VecVT;
19186   if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19187     WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19188     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19189                       DAG.getUNDEF(WideVecVT), Vec,
19190                       DAG.getIntPtrConstant(0, dl));
19191   }
19192 
19193   // Shift to the LSB.
19194   Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19195                     DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19196 
19197   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19198                      DAG.getIntPtrConstant(0, dl));
19199 }
19200 
19201 // Returns the appropriate wrapper opcode for a global reference.
19202 unsigned X86TargetLowering::getGlobalWrapperKind(
19203     const GlobalValue *GV, const unsigned char OpFlags) const {
19204   // References to absolute symbols are never PC-relative.
19205   if (GV && GV->isAbsoluteSymbolRef())
19206     return X86ISD::Wrapper;
19207 
19208   CodeModel::Model M = getTargetMachine().getCodeModel();
19209   if (Subtarget.isPICStyleRIPRel() &&
19210       (M == CodeModel::Small || M == CodeModel::Kernel))
19211     return X86ISD::WrapperRIP;
19212 
19213   // GOTPCREL references must always use RIP.
19214   if (OpFlags == X86II::MO_GOTPCREL)
19215     return X86ISD::WrapperRIP;
19216 
19217   return X86ISD::Wrapper;
19218 }
19219 
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOV32ri.
19226 SDValue
19227 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19228   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19229 
19230   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19231   // global base reg.
19232   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19233 
19234   auto PtrVT = getPointerTy(DAG.getDataLayout());
19235   SDValue Result = DAG.getTargetConstantPool(
19236       CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19237   SDLoc DL(CP);
19238   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19239   // With PIC, the address is actually $g + Offset.
19240   if (OpFlag) {
19241     Result =
19242         DAG.getNode(ISD::ADD, DL, PtrVT,
19243                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19244   }
19245 
19246   return Result;
19247 }
19248 
19249 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19250   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19251 
19252   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19253   // global base reg.
19254   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19255 
19256   auto PtrVT = getPointerTy(DAG.getDataLayout());
19257   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19258   SDLoc DL(JT);
19259   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19260 
19261   // With PIC, the address is actually $g + Offset.
19262   if (OpFlag)
19263     Result =
19264         DAG.getNode(ISD::ADD, DL, PtrVT,
19265                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19266 
19267   return Result;
19268 }
19269 
19270 SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19271                                                SelectionDAG &DAG) const {
19272   return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19273 }
19274 
19275 SDValue
19276 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  // Create the TargetBlockAddress node.
19278   unsigned char OpFlags =
19279     Subtarget.classifyBlockAddressReference();
19280   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19281   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19282   SDLoc dl(Op);
19283   auto PtrVT = getPointerTy(DAG.getDataLayout());
19284   SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19285   Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
19286 
19287   // With PIC, the address is actually $g + Offset.
19288   if (isGlobalRelativeToPICBase(OpFlags)) {
19289     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19290                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19291   }
19292 
19293   return Result;
19294 }
19295 
19296 /// Creates target global address or external symbol nodes for calls or
19297 /// other uses.
19298 SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19299                                                  bool ForCall) const {
19300   // Unpack the global address or external symbol.
19301   const SDLoc &dl = SDLoc(Op);
19302   const GlobalValue *GV = nullptr;
19303   int64_t Offset = 0;
19304   const char *ExternalSym = nullptr;
19305   if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19306     GV = G->getGlobal();
19307     Offset = G->getOffset();
19308   } else {
19309     const auto *ES = cast<ExternalSymbolSDNode>(Op);
19310     ExternalSym = ES->getSymbol();
19311   }
19312 
19313   // Calculate some flags for address lowering.
19314   const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19315   unsigned char OpFlags;
19316   if (ForCall)
19317     OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19318   else
19319     OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19320   bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19321   bool NeedsLoad = isGlobalStubReference(OpFlags);
19322 
19323   CodeModel::Model M = DAG.getTarget().getCodeModel();
19324   auto PtrVT = getPointerTy(DAG.getDataLayout());
19325   SDValue Result;
19326 
19327   if (GV) {
19328     // Create a target global address if this is a global. If possible, fold the
19329     // offset into the global address reference. Otherwise, ADD it on later.
19330     // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19331     // allowed because if the address of foo is 0, the ELF R_X86_64_32
19332     // relocation will compute to a negative value, which is invalid.
19333     int64_t GlobalOffset = 0;
19334     if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19335         X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19336       std::swap(GlobalOffset, Offset);
19337     }
19338     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19339   } else {
19340     // If this is not a global address, this must be an external symbol.
19341     Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19342   }
19343 
19344   // If this is a direct call, avoid the wrapper if we don't need to do any
19345   // loads or adds. This allows SDAG ISel to match direct calls.
19346   if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19347     return Result;
19348 
19349   Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19350 
19351   // With PIC, the address is actually $g + Offset.
19352   if (HasPICReg) {
19353     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19354                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19355   }
19356 
19357   // For globals that require a load from a stub to get the address, emit the
19358   // load.
19359   if (NeedsLoad)
19360     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19361                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19362 
19363   // If there was a non-zero offset that we didn't fold, create an explicit
19364   // addition for it.
19365   if (Offset != 0)
19366     Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19367                          DAG.getConstant(Offset, dl, PtrVT));
19368 
19369   return Result;
19370 }
19371 
19372 SDValue
19373 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19374   return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19375 }
19376 
19377 static SDValue
19378 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
19379            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
19380            unsigned char OperandFlags, bool LocalDynamic = false) {
19381   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19382   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19383   SDLoc dl(GA);
19384   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19385                                            GA->getValueType(0),
19386                                            GA->getOffset(),
19387                                            OperandFlags);
19388 
19389   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
19390                                            : X86ISD::TLSADDR;
19391 
19392   if (InFlag) {
19393     SDValue Ops[] = { Chain,  TGA, *InFlag };
19394     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19395   } else {
19396     SDValue Ops[]  = { Chain, TGA };
19397     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19398   }
19399 
  // TLSADDR will be codegen'ed as a call. Inform MFI that the function has
  // calls.
19401   MFI.setAdjustsStack(true);
19402   MFI.setHasCalls(true);
19403 
19404   SDValue Flag = Chain.getValue(1);
19405   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
19406 }
19407 
19408 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19409 static SDValue
19410 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19411                                 const EVT PtrVT) {
19412   SDValue InFlag;
19413   SDLoc dl(GA);  // ? function entry point might be better
19414   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19415                                    DAG.getNode(X86ISD::GlobalBaseReg,
19416                                                SDLoc(), PtrVT), InFlag);
19417   InFlag = Chain.getValue(1);
19418 
19419   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
19420 }
19421 
19422 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19423 static SDValue
19424 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19425                                 const EVT PtrVT) {
19426   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19427                     X86::RAX, X86II::MO_TLSGD);
19428 }
19429 
19430 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19431 static SDValue
19432 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19433                                  const EVT PtrVT) {
19434   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19435                     X86::EAX, X86II::MO_TLSGD);
19436 }
19437 
19438 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19439                                            SelectionDAG &DAG, const EVT PtrVT,
19440                                            bool Is64Bit, bool Is64BitLP64) {
19441   SDLoc dl(GA);
19442 
19443   // Get the start address of the TLS block for this module.
19444   X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
19445       .getInfo<X86MachineFunctionInfo>();
19446   MFI->incNumLocalDynamicTLSAccesses();
19447 
19448   SDValue Base;
19449   if (Is64Bit) {
19450     unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19451     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
19452                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
19453   } else {
19454     SDValue InFlag;
19455     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19456         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
19457     InFlag = Chain.getValue(1);
19458     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
19459                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
19460   }
19461 
19462   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19463   // of Base.
19464 
19465   // Build x@dtpoff.
19466   unsigned char OperandFlags = X86II::MO_DTPOFF;
19467   unsigned WrapperKind = X86ISD::Wrapper;
19468   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19469                                            GA->getValueType(0),
19470                                            GA->getOffset(), OperandFlags);
19471   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19472 
19473   // Add x@dtpoff with the base.
19474   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19475 }
19476 
19477 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19478 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19479                                    const EVT PtrVT, TLSModel::Model model,
19480                                    bool is64Bit, bool isPIC) {
19481   SDLoc dl(GA);
19482 
19483   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19484   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
19485                                                          is64Bit ? 257 : 256));
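  // Address space 256 denotes a %gs-relative access and 257 a %fs-relative
  // one, so this load reads the thread pointer at segment offset 0.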
19486 
19487   SDValue ThreadPointer =
19488       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19489                   MachinePointerInfo(Ptr));
19490 
19491   unsigned char OperandFlags = 0;
19492   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
19493   // initialexec.
19494   unsigned WrapperKind = X86ISD::Wrapper;
19495   if (model == TLSModel::LocalExec) {
19496     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19497   } else if (model == TLSModel::InitialExec) {
19498     if (is64Bit) {
19499       OperandFlags = X86II::MO_GOTTPOFF;
19500       WrapperKind = X86ISD::WrapperRIP;
19501     } else {
19502       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19503     }
19504   } else {
19505     llvm_unreachable("Unexpected model");
19506   }
19507 
19508   // emit "addl x@ntpoff,%eax" (local exec)
19509   // or "addl x@indntpoff,%eax" (initial exec)
19510   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19511   SDValue TGA =
19512       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19513                                  GA->getOffset(), OperandFlags);
19514   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19515 
19516   if (model == TLSModel::InitialExec) {
19517     if (isPIC && !is64Bit) {
19518       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19519                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19520                            Offset);
19521     }
19522 
19523     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19524                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19525   }
19526 
19527   // The address of the thread local variable is the add of the thread
19528   // pointer with the offset of the variable.
19529   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19530 }
19531 
19532 SDValue
19533 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19535   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19536 
19537   if (DAG.getTarget().useEmulatedTLS())
19538     return LowerToTLSEmulatedModel(GA, DAG);
19539 
19540   const GlobalValue *GV = GA->getGlobal();
19541   auto PtrVT = getPointerTy(DAG.getDataLayout());
19542   bool PositionIndependent = isPositionIndependent();
19543 
19544   if (Subtarget.isTargetELF()) {
19545     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19546     switch (model) {
19547       case TLSModel::GeneralDynamic:
19548         if (Subtarget.is64Bit()) {
19549           if (Subtarget.isTarget64BitLP64())
19550             return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19551           return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19552         }
19553         return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19554       case TLSModel::LocalDynamic:
19555         return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19556                                            Subtarget.isTarget64BitLP64());
19557       case TLSModel::InitialExec:
19558       case TLSModel::LocalExec:
19559         return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19560                                    PositionIndependent);
19561     }
19562     llvm_unreachable("Unknown TLS model.");
19563   }
19564 
19565   if (Subtarget.isTargetDarwin()) {
19566     // Darwin only has one model of TLS.  Lower to that.
19567     unsigned char OpFlag = 0;
19568     unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
19569                            X86ISD::WrapperRIP : X86ISD::Wrapper;
19570 
19571     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19572     // global base reg.
19573     bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19574     if (PIC32)
19575       OpFlag = X86II::MO_TLVP_PIC_BASE;
19576     else
19577       OpFlag = X86II::MO_TLVP;
19578     SDLoc DL(Op);
19579     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19580                                                 GA->getValueType(0),
19581                                                 GA->getOffset(), OpFlag);
19582     SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19583 
19584     // With PIC32, the address is actually $g + Offset.
19585     if (PIC32)
19586       Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19587                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19588                            Offset);
19589 
    // Lowering the machine ISD node will make sure everything ends up in the
    // right location.
19592     SDValue Chain = DAG.getEntryNode();
19593     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19594     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19595     SDValue Args[] = { Chain, Offset };
19596     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19597     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
19598                                DAG.getIntPtrConstant(0, DL, true),
19599                                Chain.getValue(1), DL);
19600 
    // TLSCALL will be codegen'ed as a call. Inform MFI that the function has
    // calls.
19602     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19603     MFI.setAdjustsStack(true);
19604 
19605     // And our return value (tls address) is in the standard call return value
19606     // location.
19607     unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19608     return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19609   }
19610 
19611   if (Subtarget.isOSWindows()) {
    // Just use the implicit TLS architecture.
    // Need to generate something similar to:
    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //                                  ; from TEB
    //   mov     ecx, dword [rel _tls_index]; Load index (from C runtime)
19617     //   mov     rcx, qword [rdx+rcx*8]
19618     //   mov     eax, .tls$:tlsvar
19619     //   [rax+rcx] contains the address
19620     // Windows 64bit: gs:0x58
19621     // Windows 32bit: fs:__tls_array
19622 
19623     SDLoc dl(GA);
19624     SDValue Chain = DAG.getEntryNode();
19625 
19626     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19627     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19628     // use its literal value of 0x2C.
19629     Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
19630                                         ? Type::getInt8PtrTy(*DAG.getContext(),
19631                                                              256)
19632                                         : Type::getInt32PtrTy(*DAG.getContext(),
19633                                                               257));
19634 
19635     SDValue TlsArray = Subtarget.is64Bit()
19636                            ? DAG.getIntPtrConstant(0x58, dl)
19637                            : (Subtarget.isTargetWindowsGNU()
19638                                   ? DAG.getIntPtrConstant(0x2C, dl)
19639                                   : DAG.getExternalSymbol("_tls_array", PtrVT));
19640 
19641     SDValue ThreadPointer =
19642         DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19643 
19644     SDValue res;
19645     if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19646       res = ThreadPointer;
19647     } else {
19648       // Load the _tls_index variable
19649       SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19650       if (Subtarget.is64Bit())
19651         IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19652                              MachinePointerInfo(), MVT::i32);
19653       else
19654         IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19655 
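      // The TLS array is an array of pointers, one slot per module, so scale
      // _tls_index by the pointer size to address this module's slot.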
19656       const DataLayout &DL = DAG.getDataLayout();
19657       SDValue Scale =
19658           DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19659       IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19660 
19661       res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19662     }
19663 
19664     res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19665 
    // Get the offset of the start of the .tls section.
19667     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19668                                              GA->getValueType(0),
19669                                              GA->getOffset(), X86II::MO_SECREL);
19670     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19671 
19672     // The address of the thread local variable is the add of the thread
19673     // pointer with the offset of the variable.
19674     return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19675   }
19676 
19677   llvm_unreachable("TLS not implemented for this target.");
19678 }
19679 
19680 /// Lower SRA_PARTS and friends, which return two i32 values
19681 /// and take a 2 x i32 value to shift plus a shift amount.
19682 /// TODO: Can this be moved to general expansion code?
19683 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19684   SDValue Lo, Hi;
19685   DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19686   return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19687 }
19688 
19689 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
19690                                 SelectionDAG &DAG) {
19691   MVT VT = Op.getSimpleValueType();
19692   assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
19693          "Unexpected funnel shift opcode!");
19694 
19695   SDLoc DL(Op);
19696   SDValue Op0 = Op.getOperand(0);
19697   SDValue Op1 = Op.getOperand(1);
19698   SDValue Amt = Op.getOperand(2);
19699 
19700   bool IsFSHR = Op.getOpcode() == ISD::FSHR;
19701 
19702   if (VT.isVector()) {
19703     assert(Subtarget.hasVBMI2() && "Expected VBMI2");
19704 
19705     if (IsFSHR)
19706       std::swap(Op0, Op1);
19707 
19708     // With AVX512, but not VLX we need to widen to get a 512-bit result type.
19709     if (!Subtarget.hasVLX() && !VT.is512BitVector()) {
19710       Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
19711       Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512);
19712     }
19713 
19714     SDValue Funnel;
19715     APInt APIntShiftAmt;
19716     MVT ResultVT = Op0.getSimpleValueType();
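    // A uniform constant shift amount can use the immediate X86ISD::VSHLD /
    // X86ISD::VSHRD nodes; otherwise fall back to the variable-amount
    // X86ISD::VSHLDV / X86ISD::VSHRDV nodes.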
19717     if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
19718       uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
19719       Funnel =
19720           DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0,
19721                       Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19722     } else {
19723       if (!Subtarget.hasVLX() && !VT.is512BitVector())
19724         Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512);
19725       Funnel = DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL,
19726                            ResultVT, Op0, Op1, Amt);
19727     }
19728     if (!Subtarget.hasVLX() && !VT.is512BitVector())
19729       Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits());
19730     return Funnel;
19731   }
19732   assert(
19733       (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
19734       "Unexpected funnel shift type!");
19735 
19736   // Expand slow SHLD/SHRD cases if we are not optimizing for size.
19737   bool OptForSize = DAG.shouldOptForSize();
19738   bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
19739 
19740   // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
19741   // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
19742   if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
19743       !isa<ConstantSDNode>(Amt)) {
19744     unsigned EltSizeInBits = VT.getScalarSizeInBits();
19745     SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
19746     SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
19747     Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
19748     Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
19749     Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
19750     SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
19751     Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
19752     if (IsFSHR) {
19753       Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
19754     } else {
19755       Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
19756       Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
19757     }
19758     return DAG.getZExtOrTrunc(Res, DL, VT);
19759   }
19760 
19761   if (VT == MVT::i8 || ExpandFunnel)
19762     return SDValue();
19763 
  // For i16 we must explicitly modulo the shift amount; i32/i64 shifts have an
  // implicit modulo.
19765   if (VT == MVT::i16) {
19766     Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
19767                       DAG.getConstant(15, DL, Amt.getValueType()));
19768     unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
19769     return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
19770   }
19771 
19772   return Op;
19773 }
19774 
19775 // Try to use a packed vector operation to handle i64 on 32-bit targets when
19776 // AVX512DQ is enabled.
19777 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
19778                                         const X86Subtarget &Subtarget) {
19779   assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19780           Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19781           Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19782           Op.getOpcode() == ISD::UINT_TO_FP) &&
19783          "Unexpected opcode!");
19784   bool IsStrict = Op->isStrictFPOpcode();
19785   unsigned OpNo = IsStrict ? 1 : 0;
19786   SDValue Src = Op.getOperand(OpNo);
19787   MVT SrcVT = Src.getSimpleValueType();
19788   MVT VT = Op.getSimpleValueType();
19789 
  if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
      (VT != MVT::f32 && VT != MVT::f64))
19792     return SDValue();
19793 
19794   // Pack the i64 into a vector, do the operation and extract.
19795 
  // With VLX we can use a 256-bit source so the f32 result stays 128 bits;
  // without VLX only the 512-bit instruction forms are available.
19797   unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19798   MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19799   MVT VecVT = MVT::getVectorVT(VT, NumElts);
19800 
19801   SDLoc dl(Op);
19802   SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19803   if (IsStrict) {
19804     SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19805                                  {Op.getOperand(0), InVec});
19806     SDValue Chain = CvtVec.getValue(1);
19807     SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19808                                 DAG.getIntPtrConstant(0, dl));
19809     return DAG.getMergeValues({Value, Chain}, dl);
19810   }
19811 
19812   SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19813 
19814   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19815                      DAG.getIntPtrConstant(0, dl));
19816 }
19817 
19818 static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19819                           const X86Subtarget &Subtarget) {
19820   switch (Opcode) {
19821     case ISD::SINT_TO_FP:
19822       // TODO: Handle wider types with AVX/AVX512.
19823       if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19824         return false;
19825       // CVTDQ2PS or (V)CVTDQ2PD
19826       return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19827 
19828     case ISD::UINT_TO_FP:
19829       // TODO: Handle wider types and i64 elements.
19830       if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19831         return false;
19832       // VCVTUDQ2PS or VCVTUDQ2PD
19833       return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19834 
19835     default:
19836       return false;
19837   }
19838 }
19839 
19840 /// Given a scalar cast operation that is extracted from a vector, try to
19841 /// vectorize the cast op followed by extraction. This will avoid an expensive
19842 /// round-trip between XMM and GPR.
19843 static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
19844                                       const X86Subtarget &Subtarget) {
19845   // TODO: This could be enhanced to handle smaller integer types by peeking
19846   // through an extend.
19847   SDValue Extract = Cast.getOperand(0);
19848   MVT DestVT = Cast.getSimpleValueType();
19849   if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19850       !isa<ConstantSDNode>(Extract.getOperand(1)))
19851     return SDValue();
19852 
19853   // See if we have a 128-bit vector cast op for this type of cast.
19854   SDValue VecOp = Extract.getOperand(0);
19855   MVT FromVT = VecOp.getSimpleValueType();
19856   unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19857   MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19858   MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19859   if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19860     return SDValue();
19861 
19862   // If we are extracting from a non-zero element, first shuffle the source
19863   // vector to allow extracting from element zero.
19864   SDLoc DL(Cast);
19865   if (!isNullConstant(Extract.getOperand(1))) {
19866     SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19867     Mask[0] = Extract.getConstantOperandVal(1);
19868     VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19869   }
19870   // If the source vector is wider than 128-bits, extract the low part. Do not
19871   // create an unnecessarily wide vector cast op.
19872   if (FromVT != Vec128VT)
19873     VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19874 
19875   // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19876   // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19877   SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19878   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19879                      DAG.getIntPtrConstant(0, DL));
19880 }
19881 
19882 /// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19883 /// try to vectorize the cast ops. This will avoid an expensive round-trip
19884 /// between XMM and GPR.
19885 static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
19886                                 const X86Subtarget &Subtarget) {
19887   // TODO: Allow FP_TO_UINT.
19888   SDValue CastToInt = CastToFP.getOperand(0);
19889   MVT VT = CastToFP.getSimpleValueType();
19890   if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19891     return SDValue();
19892 
19893   MVT IntVT = CastToInt.getSimpleValueType();
19894   SDValue X = CastToInt.getOperand(0);
19895   MVT SrcVT = X.getSimpleValueType();
19896   if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19897     return SDValue();
19898 
19899   // See if we have 128-bit vector cast instructions for this type of cast.
19900   // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19901   if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19902       IntVT != MVT::i32)
19903     return SDValue();
19904 
19905   unsigned SrcSize = SrcVT.getSizeInBits();
19906   unsigned IntSize = IntVT.getSizeInBits();
19907   unsigned VTSize = VT.getSizeInBits();
19908   MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19909   MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19910   MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19911 
19912   // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19913   unsigned ToIntOpcode =
19914       SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19915   unsigned ToFPOpcode =
19916       IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19917 
19918   // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19919   //
19920   // We are not defining the high elements (for example, zero them) because
19921   // that could nullify any performance advantage that we hoped to gain from
19922   // this vector op hack. We do not expect any adverse effects (like denorm
19923   // penalties) with cast ops.
19924   SDLoc DL(CastToFP);
19925   SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
19926   SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19927   SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19928   SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19929   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19930 }
19931 
19932 static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
19933                                     const X86Subtarget &Subtarget) {
19934   SDLoc DL(Op);
19935   bool IsStrict = Op->isStrictFPOpcode();
19936   MVT VT = Op->getSimpleValueType(0);
19937   SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19938 
19939   if (Subtarget.hasDQI()) {
19940     assert(!Subtarget.hasVLX() && "Unexpected features");
19941 
19942     assert((Src.getSimpleValueType() == MVT::v2i64 ||
19943             Src.getSimpleValueType() == MVT::v4i64) &&
19944            "Unsupported custom type");
19945 
19946     // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19947     assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19948            "Unexpected VT!");
19949     MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19950 
19951     // Need to concat with zero vector for strict fp to avoid spurious
19952     // exceptions.
19953     SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19954                            : DAG.getUNDEF(MVT::v8i64);
19955     Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19956                       DAG.getIntPtrConstant(0, DL));
19957     SDValue Res, Chain;
19958     if (IsStrict) {
19959       Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19960                         {Op->getOperand(0), Src});
19961       Chain = Res.getValue(1);
19962     } else {
19963       Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19964     }
19965 
19966     Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19967                       DAG.getIntPtrConstant(0, DL));
19968 
19969     if (IsStrict)
19970       return DAG.getMergeValues({Res, Chain}, DL);
19971     return Res;
19972   }
19973 
19974   bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19975                   Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19976   if (VT != MVT::v4f32 || IsSigned)
19977     return SDValue();
19978 
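  // Lower v4i64 uint_to_fp to v4f32 without native support: lanes that do
  // not fit in a signed i64 are halved with "round to odd"
  // ((Src >> 1) | (Src & 1)), converted with the signed scalar conversion,
  // and then doubled with an FADD; in-range lanes are converted directly.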
19979   SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19980   SDValue One  = DAG.getConstant(1, DL, MVT::v4i64);
19981   SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19982                              DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19983                              DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19984   SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19985   SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19986   SmallVector<SDValue, 4> SignCvts(4);
19987   SmallVector<SDValue, 4> Chains(4);
19988   for (int i = 0; i != 4; ++i) {
19989     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19990                               DAG.getIntPtrConstant(i, DL));
19991     if (IsStrict) {
19992       SignCvts[i] =
19993           DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19994                       {Op.getOperand(0), Elt});
19995       Chains[i] = SignCvts[i].getValue(1);
19996     } else {
19997       SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19998     }
19999   }
20000   SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20001 
20002   SDValue Slow, Chain;
20003   if (IsStrict) {
20004     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20005     Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20006                        {Chain, SignCvt, SignCvt});
20007     Chain = Slow.getValue(1);
20008   } else {
20009     Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20010   }
20011 
20012   IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20013   SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20014 
20015   if (IsStrict)
20016     return DAG.getMergeValues({Cvt, Chain}, DL);
20017 
20018   return Cvt;
20019 }
20020 
20021 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20022                                            SelectionDAG &DAG) const {
20023   bool IsStrict = Op->isStrictFPOpcode();
20024   unsigned OpNo = IsStrict ? 1 : 0;
20025   SDValue Src = Op.getOperand(OpNo);
20026   SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20027   MVT SrcVT = Src.getSimpleValueType();
20028   MVT VT = Op.getSimpleValueType();
20029   SDLoc dl(Op);
20030 
20031   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20032     return Extract;
20033 
20034   if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
20035     return R;
20036 
20037   if (SrcVT.isVector()) {
20038     if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
      // Note: Since v2f64 is a legal type, we don't need to zero extend the
      // source for strict FP.
20041       if (IsStrict)
20042         return DAG.getNode(
20043             X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20044             {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20045                                 DAG.getUNDEF(SrcVT))});
20046       return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20047                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20048                                      DAG.getUNDEF(SrcVT)));
20049     }
20050     if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20051       return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20052 
20053     return SDValue();
20054   }
20055 
20056   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20057          "Unknown SINT_TO_FP to lower!");
20058 
20059   bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20060 
20061   // These are really Legal; return the operand so the caller accepts it as
20062   // Legal.
20063   if (SrcVT == MVT::i32 && UseSSEReg)
20064     return Op;
20065   if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20066     return Op;
20067 
20068   if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20069     return V;
20070 
  // SSE doesn't have an i16 conversion, so we need to promote.
20072   if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20073     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20074     if (IsStrict)
20075       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20076                          {Chain, Ext});
20077 
20078     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20079   }
20080 
20081   if (VT == MVT::f128)
20082     return SDValue();
20083 
20084   SDValue ValueToStore = Src;
20085   if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20086     // Bitcasting to f64 here allows us to do a single 64-bit store from
20087     // an SSE register, avoiding the store forwarding penalty that would come
20088     // with two 32-bit stores.
20089     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20090 
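  // Otherwise, spill the integer to a stack slot and convert it with x87
  // FILD via BuildFILD, which also moves the result back into an SSE
  // register when needed.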
20091   unsigned Size = SrcVT.getStoreSize();
20092   Align Alignment(Size);
20093   MachineFunction &MF = DAG.getMachineFunction();
20094   auto PtrVT = getPointerTy(MF.getDataLayout());
20095   int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20096   MachinePointerInfo MPI =
20097       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20098   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20099   Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20100   std::pair<SDValue, SDValue> Tmp =
20101       BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20102 
20103   if (IsStrict)
20104     return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20105 
20106   return Tmp.first;
20107 }
20108 
20109 std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20110     EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20111     MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20112   // Build the FILD
20113   SDVTList Tys;
20114   bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20115   if (useSSE)
20116     Tys = DAG.getVTList(MVT::f80, MVT::Other);
20117   else
20118     Tys = DAG.getVTList(DstVT, MVT::Other);
20119 
20120   SDValue FILDOps[] = {Chain, Pointer};
20121   SDValue Result =
20122       DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20123                               Alignment, MachineMemOperand::MOLoad);
20124   Chain = Result.getValue(1);
20125 
20126   if (useSSE) {
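    // There is no direct move from an x87 register to an SSE register, so
    // round-trip the FILD result through a stack slot: store it in the
    // destination type with FST, then reload it.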
20127     MachineFunction &MF = DAG.getMachineFunction();
20128     unsigned SSFISize = DstVT.getStoreSize();
20129     int SSFI =
20130         MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20131     auto PtrVT = getPointerTy(MF.getDataLayout());
20132     SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20133     Tys = DAG.getVTList(MVT::Other);
20134     SDValue FSTOps[] = {Chain, Result, StackSlot};
20135     MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
20136         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
20137         MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20138 
20139     Chain =
20140         DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20141     Result = DAG.getLoad(
20142         DstVT, DL, Chain, StackSlot,
20143         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20144     Chain = Result.getValue(1);
20145   }
20146 
20147   return { Result, Chain };
20148 }
20149 
20150 /// Horizontal vector math instructions may be slower than normal math with
20151 /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20152 /// implementation, and likely shuffle complexity of the alternate sequence.
20153 static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20154                                   const X86Subtarget &Subtarget) {
20155   bool IsOptimizingSize = DAG.shouldOptForSize();
20156   bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20157   return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20158 }
20159 
20160 /// 64-bit unsigned integer to double expansion.
20161 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
20162                                    const X86Subtarget &Subtarget) {
  // We can't use this algorithm for strict fp: it produces -0.0 instead of
  // +0.0 when converting 0 while rounding toward negative infinity. The
  // caller will fall back to Expand when i64 is legal, or use FILD in 32-bit
  // mode.
20166   assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
  // This algorithm is not obvious. Here is what we're trying to output:
20168   /*
20169      movq       %rax,  %xmm0
20170      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20171      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20172      #ifdef __SSE3__
20173        haddpd   %xmm0, %xmm0
20174      #else
20175        pshufd   $0x4e, %xmm0, %xmm1
20176        addpd    %xmm1, %xmm0
20177      #endif
20178   */
20179 
20180   SDLoc dl(Op);
20181   LLVMContext *Context = DAG.getContext();
20182 
20183   // Build some magic constants.
20184   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20185   Constant *C0 = ConstantDataVector::get(*Context, CV0);
20186   auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20187   SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20188 
20189   SmallVector<Constant*,2> CV1;
20190   CV1.push_back(
20191     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20192                                       APInt(64, 0x4330000000000000ULL))));
20193   CV1.push_back(
20194     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20195                                       APInt(64, 0x4530000000000000ULL))));
20196   Constant *C1 = ConstantVector::get(CV1);
20197   SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20198 
20199   // Load the 64-bit value into an XMM register.
20200   SDValue XR1 =
20201       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20202   SDValue CLod0 = DAG.getLoad(
20203       MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20204       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20205   SDValue Unpck1 =
20206       getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20207 
20208   SDValue CLod1 = DAG.getLoad(
20209       MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20210       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20211   SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20212   // TODO: Are there any fast-math-flags to propagate here?
20213   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20214   SDValue Result;
20215 
20216   if (Subtarget.hasSSE3() &&
20217       shouldUseHorizontalOp(true, DAG, Subtarget)) {
20218     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20219   } else {
20220     SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20221     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20222   }
20223   Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20224                        DAG.getIntPtrConstant(0, dl));
20225   return Result;
20226 }
20227 
20228 /// 32-bit unsigned integer to float expansion.
20229 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
20230                                    const X86Subtarget &Subtarget) {
20231   unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20232   SDLoc dl(Op);
20233   // FP constant to bias correct the final result.
20234   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
20235                                    MVT::f64);
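  // 0x4330000000000000 is 2^52 as an IEEE double. OR'ing the 32-bit input
  // into its low mantissa bits yields exactly 2^52 + x, so subtracting the
  // bias afterwards recovers x as a double.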
20236 
20237   // Load the 32-bit value into an XMM register.
20238   SDValue Load =
20239       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20240 
20241   // Zero out the upper parts of the register.
20242   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20243 
20244   // Or the load with the bias.
20245   SDValue Or = DAG.getNode(
20246       ISD::OR, dl, MVT::v2i64,
20247       DAG.getBitcast(MVT::v2i64, Load),
20248       DAG.getBitcast(MVT::v2i64,
20249                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20250   Or =
20251       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20252                   DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
20253 
20254   if (Op.getNode()->isStrictFPOpcode()) {
20255     // Subtract the bias.
20256     // TODO: Are there any fast-math-flags to propagate here?
20257     SDValue Chain = Op.getOperand(0);
20258     SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20259                               {Chain, Or, Bias});
20260 
20261     if (Op.getValueType() == Sub.getValueType())
20262       return Sub;
20263 
20264     // Handle final rounding.
20265     std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20266         Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20267 
20268     return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20269   }
20270 
20271   // Subtract the bias.
20272   // TODO: Are there any fast-math-flags to propagate here?
20273   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20274 
20275   // Handle final rounding.
20276   return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20277 }
20278 
20279 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
20280                                      const X86Subtarget &Subtarget,
20281                                      const SDLoc &DL) {
20282   if (Op.getSimpleValueType() != MVT::v2f64)
20283     return SDValue();
20284 
20285   bool IsStrict = Op->isStrictFPOpcode();
20286 
20287   SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20288   assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20289 
20290   if (Subtarget.hasAVX512()) {
20291     if (!Subtarget.hasVLX()) {
20292       // Let generic type legalization widen this.
20293       if (!IsStrict)
20294         return SDValue();
20295       // Otherwise pad the integer input with 0s and widen the operation.
20296       N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20297                        DAG.getConstant(0, DL, MVT::v2i32));
20298       SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20299                                 {Op.getOperand(0), N0});
20300       SDValue Chain = Res.getValue(1);
20301       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20302                         DAG.getIntPtrConstant(0, DL));
20303       return DAG.getMergeValues({Res, Chain}, DL);
20304     }
20305 
20306     // Legalize to v4i32 type.
20307     N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20308                      DAG.getUNDEF(MVT::v2i32));
20309     if (IsStrict)
20310       return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20311                          {Op.getOperand(0), N0});
20312     return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20313   }
20314 
20315   // Zero extend to 2i64, OR with the floating point representation of 2^52.
20316   // This gives us the floating point equivalent of 2^52 + the i32 integer
20317   // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20318   // point leaving just our i32 integers in double format.
20319   SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20320   SDValue VBias =
20321       DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
20322   SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20323                            DAG.getBitcast(MVT::v2i64, VBias));
20324   Or = DAG.getBitcast(MVT::v2f64, Or);
20325 
20326   if (IsStrict)
20327     return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20328                        {Op.getOperand(0), Or, VBias});
20329   return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20330 }
20331 
20332 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
20333                                      const X86Subtarget &Subtarget) {
20334   SDLoc DL(Op);
20335   bool IsStrict = Op->isStrictFPOpcode();
20336   SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20337   MVT VecIntVT = V.getSimpleValueType();
20338   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20339          "Unsupported custom type");
20340 
20341   if (Subtarget.hasAVX512()) {
20342     // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20343     assert(!Subtarget.hasVLX() && "Unexpected features");
20344     MVT VT = Op->getSimpleValueType(0);
20345 
20346     // v8i32->v8f64 is legal with AVX512 so just return it.
20347     if (VT == MVT::v8f64)
20348       return Op;
20349 
20350     assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
20351            "Unexpected VT!");
20352     MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20353     MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20354     // Need to concat with zero vector for strict fp to avoid spurious
20355     // exceptions.
20356     SDValue Tmp =
20357         IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20358     V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20359                     DAG.getIntPtrConstant(0, DL));
20360     SDValue Res, Chain;
20361     if (IsStrict) {
20362       Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20363                         {Op->getOperand(0), V});
20364       Chain = Res.getValue(1);
20365     } else {
20366       Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20367     }
20368 
20369     Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20370                       DAG.getIntPtrConstant(0, DL));
20371 
20372     if (IsStrict)
20373       return DAG.getMergeValues({Res, Chain}, DL);
20374     return Res;
20375   }
20376 
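  // v4i32 -> v4f64 with AVX: zero extend to v4i64 and OR each lane with the
  // bit pattern of 2^52 (broadcast from the constant pool), producing
  // 2^52 + x per lane; subtracting 2^52 then yields the exact double value.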
20377   if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20378       Op->getSimpleValueType(0) == MVT::v4f64) {
20379     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20380     Constant *Bias = ConstantFP::get(
20381         *DAG.getContext(),
20382         APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20383     auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20384     SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20385     SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20386     SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20387     SDValue VBias = DAG.getMemIntrinsicNode(
20388         X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20389         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20390         MachineMemOperand::MOLoad);
20391 
20392     SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20393                              DAG.getBitcast(MVT::v4i64, VBias));
20394     Or = DAG.getBitcast(MVT::v4f64, Or);
20395 
20396     if (IsStrict)
20397       return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20398                          {Op.getOperand(0), Or, VBias});
20399     return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20400   }
20401 
20402   // The algorithm is the following:
20403   // #ifdef __SSE4_1__
20404   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20405   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20406   //                                 (uint4) 0x53000000, 0xaa);
20407   // #else
20408   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20409   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
20410   // #endif
20411   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20412   //     return (float4) lo + fhi;
20413 
20414   bool Is128 = VecIntVT == MVT::v4i32;
20415   MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // If we convert to something other than the supported type, e.g., to v4f64,
  // abort early.
20418   if (VecFloatVT != Op->getSimpleValueType(0))
20419     return SDValue();
20420 
  // In the #ifdef/#else code, we have in common:
20422   // - The vector of constants:
20423   // -- 0x4b000000
20424   // -- 0x53000000
20425   // - A shift:
20426   // -- v >> 16
20427 
20428   // Create the splat vector for 0x4b000000.
20429   SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20430   // Create the splat vector for 0x53000000.
20431   SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20432 
20433   // Create the right shift.
20434   SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20435   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20436 
20437   SDValue Low, High;
20438   if (Subtarget.hasSSE41()) {
20439     MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20440     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20441     SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20442     SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20443     // Low will be bitcasted right away, so do not bother bitcasting back to its
20444     // original type.
20445     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20446                       VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20447     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20448     //                                 (uint4) 0x53000000, 0xaa);
20449     SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20450     SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20451     // High will be bitcasted right away, so do not bother bitcasting back to
20452     // its original type.
20453     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20454                        VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20455   } else {
20456     SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20457     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20458     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20459     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20460 
20461     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
20462     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20463   }
20464 
20465   // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20466   SDValue VecCstFSub = DAG.getConstantFP(
20467       APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20468 
20469   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20470   // NOTE: By using fsub of a positive constant instead of fadd of a negative
20471   // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20472   // enabled. See PR24512.
20473   SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20474   // TODO: Are there any fast-math-flags to propagate here?
20475   //     (float4) lo;
20476   SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20477   //     return (float4) lo + fhi;
20478   if (IsStrict) {
20479     SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20480                                 {Op.getOperand(0), HighBitcast, VecCstFSub});
20481     return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20482                        {FHigh.getValue(1), LowBitcast, FHigh});
20483   }
20484 
20485   SDValue FHigh =
20486       DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20487   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20488 }
20489 
20490 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
20491                                    const X86Subtarget &Subtarget) {
20492   unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20493   SDValue N0 = Op.getOperand(OpNo);
20494   MVT SrcVT = N0.getSimpleValueType();
20495   SDLoc dl(Op);
20496 
20497   switch (SrcVT.SimpleTy) {
20498   default:
20499     llvm_unreachable("Custom UINT_TO_FP is not supported!");
20500   case MVT::v2i32:
20501     return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
20502   case MVT::v4i32:
20503   case MVT::v8i32:
20504     return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
20505   case MVT::v2i64:
20506   case MVT::v4i64:
20507     return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20508   }
20509 }
20510 
20511 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20512                                            SelectionDAG &DAG) const {
20513   bool IsStrict = Op->isStrictFPOpcode();
20514   unsigned OpNo = IsStrict ? 1 : 0;
20515   SDValue Src = Op.getOperand(OpNo);
20516   SDLoc dl(Op);
20517   auto PtrVT = getPointerTy(DAG.getDataLayout());
20518   MVT SrcVT = Src.getSimpleValueType();
20519   MVT DstVT = Op->getSimpleValueType(0);
20520   SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20521 
20522   if (DstVT == MVT::f128)
20523     return SDValue();
20524 
20525   if (DstVT.isVector())
20526     return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
20527 
20528   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20529     return Extract;
20530 
20531   if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20532       (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20533     // Conversions from unsigned i32 to f32/f64 are legal,
20534     // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
20535     return Op;
20536   }
20537 
20538   // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20539   if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20540     Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20541     if (IsStrict)
20542       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20543                          {Chain, Src});
20544     return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20545   }
20546 
20547   if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20548     return V;
20549 
20550   // The transform for i64->f64 isn't correct for 0 when rounding to negative
20551   // infinity. It produces -0.0, so disable under strictfp.
20552   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
20553     return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
20554   if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
20555     return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
20556   if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20557       (DstVT == MVT::f32 || DstVT == MVT::f64))
20558     return SDValue();
20559 
20560   // Make a 64-bit buffer, and use it to build an FILD.
20561   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20562   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20563   Align SlotAlign(8);
20564   MachinePointerInfo MPI =
20565     MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20566   if (SrcVT == MVT::i32) {
20567     SDValue OffsetSlot =
20568         DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
20569     SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20570     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20571                                   OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20572     std::pair<SDValue, SDValue> Tmp =
20573         BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20574     if (IsStrict)
20575       return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20576 
20577     return Tmp.first;
20578   }
20579 
20580   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20581   SDValue ValueToStore = Src;
20582   if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20583     // Bitcasting to f64 here allows us to do a single 64-bit store from
20584     // an SSE register, avoiding the store forwarding penalty that would come
20585     // with two 32-bit stores.
20586     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20587   }
20588   SDValue Store =
20589       DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20590   // For i64 source, we need to add the appropriate power of 2 if the input
20591   // was negative. We must be careful to do the computation in x87 extended
20592   // precision, not in SSE.
20593   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20594   SDValue Ops[] = { Store, StackSlot };
20595   SDValue Fild =
20596       DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20597                               SlotAlign, MachineMemOperand::MOLoad);
20598   Chain = Fild.getValue(1);
20599 
20601   // Check whether the sign bit is set.
20602   SDValue SignSet = DAG.getSetCC(
20603       dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20604       Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20605 
20606   // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20607   APInt FF(64, 0x5F80000000000000ULL);
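  // 0x5F800000 is 2^64 as an IEEE single; the low word of the pair is 0.0f.
  // Selecting offset 4 vs. 0 below loads either 2^64 or 0.0 as the fudge
  // factor to add back when FILD treated the input as negative.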
20608   SDValue FudgePtr = DAG.getConstantPool(
20609       ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20610   Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20611 
20612   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20613   SDValue Zero = DAG.getIntPtrConstant(0, dl);
20614   SDValue Four = DAG.getIntPtrConstant(4, dl);
20615   SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20616   FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20617 
20618   // Load the value out, extending it from f32 to f80.
20619   SDValue Fudge = DAG.getExtLoad(
20620       ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20621       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20622       CPAlignment);
20623   Chain = Fudge.getValue(1);
20624   // Extend everything to 80 bits to force it to be done on x87.
20625   // TODO: Are there any fast-math-flags to propagate here?
20626   if (IsStrict) {
20627     SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
20628                               {Chain, Fild, Fudge});
20629     // STRICT_FP_ROUND can't handle equal types.
20630     if (DstVT == MVT::f80)
20631       return Add;
20632     return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20633                        {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
20634   }
20635   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
20636   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20637                      DAG.getIntPtrConstant(0, dl));
20638 }
20639 
20640 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20641 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20642 // just return an SDValue().
20643 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20644 // to i16, i32 or i64, and we lower it to a legal sequence and return the
20645 // result.
20646 SDValue
20647 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20648                                    bool IsSigned, SDValue &Chain) const {
20649   bool IsStrict = Op->isStrictFPOpcode();
20650   SDLoc DL(Op);
20651 
20652   EVT DstTy = Op.getValueType();
20653   SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20654   EVT TheVT = Value.getValueType();
20655   auto PtrVT = getPointerTy(DAG.getDataLayout());
20656 
20657   if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20658     // f16 must be promoted before using the lowering in this routine.
20659     // fp128 does not use this lowering.
20660     return SDValue();
20661   }
20662 
  // If using FIST to compute an unsigned i64, we'll need some fixup
  // to handle values above the maximum signed i64.  A FIST is always
  // used for the 32-bit subtarget, and also for f80 on a 64-bit target.
20666   bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20667 
20668   // FIXME: This does not generate an invalid exception if the input does not
20669   // fit in i32. PR44019
20670   if (!IsSigned && DstTy != MVT::i64) {
20671     // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20672     // The low 32 bits of the fist result will have the correct uint32 result.
20673     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20674     DstTy = MVT::i64;
20675   }
20676 
20677   assert(DstTy.getSimpleVT() <= MVT::i64 &&
20678          DstTy.getSimpleVT() >= MVT::i16 &&
20679          "Unknown FP_TO_INT to lower!");
20680 
20681   // We lower FP->int64 into FISTP64 followed by a load from a temporary
20682   // stack slot.
20683   MachineFunction &MF = DAG.getMachineFunction();
20684   unsigned MemSize = DstTy.getStoreSize();
20685   int SSFI =
20686       MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20687   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20688 
20689   Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20690 
  SDValue Adjust; // 0 or 0x8000000000000000, for result sign bit adjustment.
20692 
20693   if (UnsignedFixup) {
20694     //
20695     // Conversion to unsigned i64 is implemented with a select,
20696     // depending on whether the source value fits in the range
20697     // of a signed i64.  Let Thresh be the FP equivalent of
20698     // 0x8000000000000000ULL.
20699     //
    //  Adjust  = (Value >= Thresh) ? 0x8000000000000000 : 0;
    //  FltOfs  = (Value >= Thresh) ? Thresh : 0.0;
    //  FistSrc = (Value - FltOfs);
    //  Fist-to-mem64 FistSrc
    //  Add 0 or 0x8000000000000000ULL to the 64-bit result, which is
    //  equivalent to XOR'ing the result with Adjust.
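    //
    //  For example, with an f80 source value of 2^63 + 42 (too large for a
    //  signed i64): FltOfs == Thresh, FistSrc == 42.0, the FIST result is
    //  42, and XOR'ing with Adjust == 0x8000000000000000 yields 2^63 + 42.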
20706     //
20707     // Being a power of 2, Thresh is exactly representable in all FP formats.
20708     // For X87 we'd like to use the smallest FP type for this constant, but
20709     // for DAG type consistency we have to match the FP operand type.
20710 
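    // 0x5f000000 is the IEEE-754 single-precision bit pattern for 2^63.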
20711     APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20712     LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
20713     bool LosesInfo = false;
20714     if (TheVT == MVT::f64)
20715       // The rounding mode is irrelevant as the conversion should be exact.
20716       Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20717                               &LosesInfo);
20718     else if (TheVT == MVT::f80)
20719       Status = Thresh.convert(APFloat::x87DoubleExtended(),
20720                               APFloat::rmNearestTiesToEven, &LosesInfo);
20721 
20722     assert(Status == APFloat::opOK && !LosesInfo &&
20723            "FP conversion should have been exact");
20724 
20725     SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20726 
20727     EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20728                                    *DAG.getContext(), TheVT);
20729     SDValue Cmp;
20730     if (IsStrict) {
20731       Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20732                          /*IsSignaling*/ true);
20733       Chain = Cmp.getValue(1);
20734     } else {
20735       Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20736     }
20737 
20738     // Our preferred lowering of
20739     //
20740     // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20741     //
20742     // is
20743     //
20744     // (Value >= Thresh) << 63
20745     //
20746     // but since we can get here after LegalOperations, DAGCombine might do the
20747     // wrong thing if we create a select. So, directly create the preferred
20748     // version.
20749     SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20750     SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20751     Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20752 
20753     SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20754                                    DAG.getConstantFP(0.0, DL, TheVT));
20755 
20756     if (IsStrict) {
20757       Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20758                           { Chain, Value, FltOfs });
20759       Chain = Value.getValue(1);
20760     } else
20761       Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20762   }
20763 
20764   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20765 
20766   // FIXME This causes a redundant load/store if the SSE-class value is already
20767   // in memory, such as if it is on the callstack.
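  // FP_TO_INT*_IN_MEM (FIST) needs its operand in an x87 register, so an
  // SSE-class value is first spilled to the stack slot and reloaded via FLD.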
20768   if (isScalarFPTypeInSSEReg(TheVT)) {
20769     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20770     Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20771     SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20772     SDValue Ops[] = { Chain, StackSlot };
20773 
20774     unsigned FLDSize = TheVT.getStoreSize();
20775     assert(FLDSize <= MemSize && "Stack slot not big enough");
20776     MachineMemOperand *MMO = MF.getMachineMemOperand(
20777         MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20778     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20779     Chain = Value.getValue(1);
20780   }
20781 
20782   // Build the FP_TO_INT*_IN_MEM
20783   MachineMemOperand *MMO = MF.getMachineMemOperand(
20784       MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20785   SDValue Ops[] = { Chain, Value, StackSlot };
20786   SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20787                                          DAG.getVTList(MVT::Other),
20788                                          Ops, DstTy, MMO);
20789 
20790   SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
20791   Chain = Res.getValue(1);
20792 
20793   // If we need an unsigned fixup, XOR the result with adjust.
20794   if (UnsignedFixup)
20795     Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20796 
20797   return Res;
20798 }
20799 
20800 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
20801                               const X86Subtarget &Subtarget) {
20802   MVT VT = Op.getSimpleValueType();
20803   SDValue In = Op.getOperand(0);
20804   MVT InVT = In.getSimpleValueType();
20805   SDLoc dl(Op);
20806   unsigned Opc = Op.getOpcode();
20807 
20808   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20809   assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20810          "Unexpected extension opcode");
20811   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20812          "Expected same number of elements");
20813   assert((VT.getVectorElementType() == MVT::i16 ||
20814           VT.getVectorElementType() == MVT::i32 ||
20815           VT.getVectorElementType() == MVT::i64) &&
20816          "Unexpected element type");
20817   assert((InVT.getVectorElementType() == MVT::i8 ||
20818           InVT.getVectorElementType() == MVT::i16 ||
20819           InVT.getVectorElementType() == MVT::i32) &&
20820          "Unexpected element type");
20821 
20822   unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
20823 
20824   if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20825     assert(InVT == MVT::v32i8 && "Unexpected VT!");
20826     return splitVectorIntUnary(Op, DAG);
20827   }
20828 
20829   if (Subtarget.hasInt256())
20830     return Op;
20831 
20832   // Optimize vectors in AVX mode:
20833   //
20834   //   v8i16 -> v8i32
  //   Use vpmovzxwd for the 4 lower elements  v8i16 -> v4i32.
20836   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
20837   //   Concat upper and lower parts.
20838   //
20839   //   v4i32 -> v4i64
  //   Use vpmovzxdq for the 2 lower elements  v4i32 -> v2i64.
  //   Use vpunpckhdq for the 2 upper elements  v4i32 -> v2i64.
20842   //   Concat upper and lower parts.
20843   //
20844   MVT HalfVT = VT.getHalfNumVectorElementsVT();
20845   SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20846 
20847   // Short-circuit if we can determine that each 128-bit half is the same value.
20848   // Otherwise, this is difficult to match and optimize.
20849   if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20850     if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20851       return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20852 
20853   SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20854   SDValue Undef = DAG.getUNDEF(InVT);
20855   bool NeedZero = Opc == ISD::ZERO_EXTEND;
20856   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20857   OpHi = DAG.getBitcast(HalfVT, OpHi);
20858 
20859   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20860 }
20861 
20862 // Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20863 static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20864                                    const SDLoc &dl, SelectionDAG &DAG) {
20865   assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20866   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20867                            DAG.getIntPtrConstant(0, dl));
20868   SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20869                            DAG.getIntPtrConstant(8, dl));
20870   Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20871   Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20872   SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20873   return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20874 }
20875 
static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
20879   MVT VT = Op->getSimpleValueType(0);
20880   SDValue In = Op->getOperand(0);
20881   MVT InVT = In.getSimpleValueType();
20882   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20883   SDLoc DL(Op);
20884   unsigned NumElts = VT.getVectorNumElements();
20885 
  // For all vectors but vXi8, we can just emit a sign_extend and a shift.
  // This avoids a constant pool load.
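  // (sign_extend turns each i1 lane into all-ones or all-zeros, and the
  // logical shift right by (element size - 1) then leaves 1 or 0 per lane.)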
20888   if (VT.getVectorElementType() != MVT::i8) {
20889     SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20890     return DAG.getNode(ISD::SRL, DL, VT, Extend,
20891                        DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20892   }
20893 
20894   // Extend VT if BWI is not supported.
20895   MVT ExtVT = VT;
20896   if (!Subtarget.hasBWI()) {
20897     // If v16i32 is to be avoided, we'll need to split and concatenate.
20898     if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20899       return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20900 
20901     ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20902   }
20903 
20904   // Widen to 512-bits if VLX is not supported.
20905   MVT WideVT = ExtVT;
20906   if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20907     NumElts *= 512 / ExtVT.getSizeInBits();
20908     InVT = MVT::getVectorVT(MVT::i1, NumElts);
20909     In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
20910                      In, DAG.getIntPtrConstant(0, DL));
20911     WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
20912                               NumElts);
20913   }
20914 
20915   SDValue One = DAG.getConstant(1, DL, WideVT);
20916   SDValue Zero = DAG.getConstant(0, DL, WideVT);
20917 
20918   SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20919 
20920   // Truncate if we had to extend above.
20921   if (VT != ExtVT) {
20922     WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20923     SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20924   }
20925 
20926   // Extract back to 128/256-bit if we widened.
20927   if (WideVT != VT)
20928     SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20929                               DAG.getIntPtrConstant(0, DL));
20930 
20931   return SelectedVal;
20932 }
20933 
20934 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20935                                 SelectionDAG &DAG) {
20936   SDValue In = Op.getOperand(0);
20937   MVT SVT = In.getSimpleValueType();
20938 
20939   if (SVT.getVectorElementType() == MVT::i1)
20940     return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
20941 
20942   assert(Subtarget.hasAVX() && "Expected AVX support");
20943   return LowerAVXExtend(Op, DAG, Subtarget);
20944 }
20945 
20946 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20947 /// It makes use of the fact that vectors with enough leading sign/zero bits
20948 /// prevent the PACKSS/PACKUS from saturating the results.
20949 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20950 /// within each 128-bit lane.
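/// For example, a v8i32 whose elements are all known to fit in 16 bits can
/// be truncated to v8i16 with a single PACKUSDW/PACKSSDW of its two 128-bit
/// halves.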
20951 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20952                                       const SDLoc &DL, SelectionDAG &DAG,
20953                                       const X86Subtarget &Subtarget) {
20954   assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20955          "Unexpected PACK opcode");
20956   assert(DstVT.isVector() && "VT not a vector?");
20957 
20958   // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20959   if (!Subtarget.hasSSE2())
20960     return SDValue();
20961 
20962   EVT SrcVT = In.getValueType();
20963 
  // No truncation required; we might get here due to recursive calls.
20965   if (SrcVT == DstVT)
20966     return In;
20967 
  // We only support vector truncation to 64 bits or greater from a source
  // of 128 bits or greater.
20970   unsigned DstSizeInBits = DstVT.getSizeInBits();
20971   unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20972   if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
20973     return SDValue();
20974 
20975   unsigned NumElems = SrcVT.getVectorNumElements();
20976   if (!isPowerOf2_32(NumElems))
20977     return SDValue();
20978 
20979   LLVMContext &Ctx = *DAG.getContext();
20980   assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20981   assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20982 
20983   EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20984 
20985   // Pack to the largest type possible:
20986   // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20987   EVT InVT = MVT::i16, OutVT = MVT::i8;
20988   if (SrcVT.getScalarSizeInBits() > 16 &&
20989       (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20990     InVT = MVT::i32;
20991     OutVT = MVT::i16;
20992   }
20993 
20994   // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
20995   if (SrcVT.is128BitVector()) {
20996     InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20997     OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20998     In = DAG.getBitcast(InVT, In);
20999     SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
21000     Res = extractSubVector(Res, 0, DAG, DL, 64);
21001     return DAG.getBitcast(DstVT, Res);
21002   }
21003 
21004   // Split lower/upper subvectors.
21005   SDValue Lo, Hi;
21006   std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21007 
21008   unsigned SubSizeInBits = SrcSizeInBits / 2;
21009   InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21010   OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21011 
21012   // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21013   if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21014     Lo = DAG.getBitcast(InVT, Lo);
21015     Hi = DAG.getBitcast(InVT, Hi);
21016     SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21017     return DAG.getBitcast(DstVT, Res);
21018   }
21019 
21020   // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21021   // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21022   if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21023     Lo = DAG.getBitcast(InVT, Lo);
21024     Hi = DAG.getBitcast(InVT, Hi);
21025     SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21026 
21027     // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21028     // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21029     // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21030     SmallVector<int, 64> Mask;
21031     int Scale = 64 / OutVT.getScalarSizeInBits();
21032     narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21033     Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21034 
21035     if (DstVT.is256BitVector())
21036       return DAG.getBitcast(DstVT, Res);
21037 
21038     // If 512bit -> 128bit truncate another stage.
21039     EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21040     Res = DAG.getBitcast(PackedVT, Res);
21041     return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21042   }
21043 
21044   // Recursively pack lower/upper subvectors, concat result and pack again.
21045   assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21046   EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21047   Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
21048   Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
21049 
21050   PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21051   SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21052   return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21053 }
21054 
21055 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
21056                                   const X86Subtarget &Subtarget) {
21058   SDLoc DL(Op);
21059   MVT VT = Op.getSimpleValueType();
21060   SDValue In = Op.getOperand(0);
21061   MVT InVT = In.getSimpleValueType();
21062 
21063   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21064 
21065   // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
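  // (VPMOVB2M/VPMOVW2M and the setcc patterns emitted below key off the
  // sign bit of each element, so the boolean in the LSB has to be shifted
  // up to the MSB first unless the input is already sign-extended.)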
21066   unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21067   if (InVT.getScalarSizeInBits() <= 16) {
21068     if (Subtarget.hasBWI()) {
      // Legal; this will be selected to VPMOVB2M/VPMOVW2M.
21070       if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
        // We need to shift to get the lsb into the sign position. Shifting
        // packed bytes is not supported natively, so bitcast to words.
21073         MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21074         In = DAG.getNode(ISD::SHL, DL, ExtVT,
21075                          DAG.getBitcast(ExtVT, In),
21076                          DAG.getConstant(ShiftInx, DL, ExtVT));
21077         In = DAG.getBitcast(InVT, In);
21078       }
21079       return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21080                           In, ISD::SETGT);
21081     }
21082     // Use TESTD/Q, extended vector to packed dword/qword.
21083     assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21084            "Unexpected vector type.");
21085     unsigned NumElts = InVT.getVectorNumElements();
21086     assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21087     // We need to change to a wider element type that we have support for.
21088     // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21089     // For 16 element vectors we extend to v16i32 unless we are explicitly
21090     // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21091     // we need to split into two 8 element vectors which we can extend to v8i32,
21092     // truncate and concat the results. There's an additional complication if
21093     // the original type is v16i8. In that case we can't split the v16i8
21094     // directly, so we need to shuffle high elements to low and use
21095     // sign_extend_vector_inreg.
21096     if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21097       SDValue Lo, Hi;
21098       if (InVT == MVT::v16i8) {
21099         Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21100         Hi = DAG.getVectorShuffle(
21101             InVT, DL, In, In,
21102             {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21103         Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21104       } else {
21105         assert(InVT == MVT::v16i16 && "Unexpected VT!");
21106         Lo = extract128BitVector(In, 0, DAG, DL);
21107         Hi = extract128BitVector(In, 8, DAG, DL);
21108       }
21109       // We're split now, just emit two truncates and a concat. The two
21110       // truncates will trigger legalization to come back to this function.
21111       Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21112       Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21113       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21114     }
21115     // We either have 8 elements or we're allowed to use 512-bit vectors.
21116     // If we have VLX, we want to use the narrowest vector that can get the
21117     // job done so we use vXi32.
21118     MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21119     MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21120     In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21121     InVT = ExtVT;
21122     ShiftInx = InVT.getScalarSizeInBits() - 1;
21123   }
21124 
21125   if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21126     // We need to shift to get the lsb into sign position.
21127     In = DAG.getNode(ISD::SHL, DL, InVT, In,
21128                      DAG.getConstant(ShiftInx, DL, InVT));
21129   }
21130   // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21131   if (Subtarget.hasDQI())
21132     return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21133   return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21134 }
21135 
21136 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21137   SDLoc DL(Op);
21138   MVT VT = Op.getSimpleValueType();
21139   SDValue In = Op.getOperand(0);
21140   MVT InVT = In.getSimpleValueType();
21141   unsigned InNumEltBits = InVT.getScalarSizeInBits();
21142 
21143   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21144          "Invalid TRUNCATE operation");
21145 
21146   // If we're called by the type legalizer, handle a few cases.
21147   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21148   if (!TLI.isTypeLegal(InVT)) {
21149     if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21150         VT.is128BitVector()) {
21151       assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21152              "Unexpected subtarget!");
21153       // The default behavior is to truncate one step, concatenate, and then
21154       // truncate the remainder. We'd rather produce two 64-bit results and
21155       // concatenate those.
21156       SDValue Lo, Hi;
21157       std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21158 
21159       EVT LoVT, HiVT;
21160       std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21161 
21162       Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21163       Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21164       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21165     }
21166 
21167     // Otherwise let default legalization handle it.
21168     return SDValue();
21169   }
21170 
21171   if (VT.getVectorElementType() == MVT::i1)
21172     return LowerTruncateVecI1(Op, DAG, Subtarget);
21173 
21174   // vpmovqb/w/d, vpmovdb/w, vpmovwb
21175   if (Subtarget.hasAVX512()) {
21176     if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21177       assert(VT == MVT::v32i8 && "Unexpected VT!");
21178       return splitVectorIntUnary(Op, DAG);
21179     }
21180 
    // Word to byte truncation is only available with BWI. Otherwise we have
    // to promote to v16i32 and then truncate that. But we should only do
    // that if we haven't been asked to avoid 512-bit vectors. The actual
    // promotion to v16i32 will be handled by isel patterns.
21185     if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21186         Subtarget.canExtendTo512DQ())
21187       return Op;
21188   }
21189 
21190   unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
21191   unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21192 
21193   // Truncate with PACKUS if we are truncating a vector with leading zero bits
21194   // that extend all the way to the packed/truncated value.
21195   // Pre-SSE41 we can only use PACKUSWB.
21196   KnownBits Known = DAG.computeKnownBits(In);
21197   if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
21198     if (SDValue V =
21199             truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
21200       return V;
21201 
21202   // Truncate with PACKSS if we are truncating a vector with sign-bits that
21203   // extend all the way to the packed/truncated value.
21204   if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
21205     if (SDValue V =
21206             truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
21207       return V;
21208 
21209   // Handle truncation of V256 to V128 using shuffles.
21210   assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21211 
21212   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21213     In = DAG.getBitcast(MVT::v8i32, In);
21214 
21215     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21216     if (Subtarget.hasInt256()) {
21217       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21218       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21219       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21220                          DAG.getIntPtrConstant(0, DL));
21221     }
21222 
21223     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21224                                DAG.getIntPtrConstant(0, DL));
21225     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21226                                DAG.getIntPtrConstant(4, DL));
21227     static const int ShufMask[] = {0, 2, 4, 6};
21228     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
21229   }
21230 
21231   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21232     In = DAG.getBitcast(MVT::v32i8, In);
21233 
21234     // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21235     if (Subtarget.hasInt256()) {
21236       // The PSHUFB mask:
21237       static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
21238                                       -1, -1, -1, -1, -1, -1, -1, -1,
21239                                       16, 17, 20, 21, 24, 25, 28, 29,
21240                                       -1, -1, -1, -1, -1, -1, -1, -1 };
21241       In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21242       In = DAG.getBitcast(MVT::v4i64, In);
21243 
21244       static const int ShufMask2[] = {0, 2, -1, -1};
21245       In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21246       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
21247                          DAG.getBitcast(MVT::v16i16, In),
21248                          DAG.getIntPtrConstant(0, DL));
21249     }
21250 
21251     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21252                                DAG.getIntPtrConstant(0, DL));
21253     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21254                                DAG.getIntPtrConstant(16, DL));
21255 
21256     // The PSHUFB mask:
21257     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
21258                                    -1, -1, -1, -1, -1, -1, -1, -1};
21259 
21260     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
21261     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
21262 
21263     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
21264     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
21265 
21266     // The MOVLHPS Mask:
21267     static const int ShufMask2[] = {0, 1, 4, 5};
    SDValue Res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
    return DAG.getBitcast(MVT::v8i16, Res);
21270   }
21271 
21272   if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
    // Use an AND to zero the upper bits for PACKUS.
21274     In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
21275 
21276     SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21277                                DAG.getIntPtrConstant(0, DL));
21278     SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21279                                DAG.getIntPtrConstant(8, DL));
21280     return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
21281   }
21282 
21283   llvm_unreachable("All 256->128 cases should have been handled above!");
21284 }
21285 
21286 // We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21287 // behaves on out of range inputs to generate optimized conversions.
21288 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21289                                     SelectionDAG &DAG,
21290                                     const X86Subtarget &Subtarget) {
21291   MVT SrcVT = Src.getSimpleValueType();
21292   unsigned DstBits = VT.getScalarSizeInBits();
21293   assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21294 
21295   // Calculate the converted result for values in the range 0 to
21296   // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21297   SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21298   SDValue Big =
21299       DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21300                   DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21301                               DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21302 
  // The "CVTTP2SI" instruction conveniently sets the sign bit if
  // and only if the value was out of range. So we can use that
  // as our indicator that we should use "Big" instead of "Small".
21306   //
21307   // Use "Small" if "IsOverflown" has all bits cleared
21308   // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
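  //
  // For example, for a lane holding 3e9 (greater than 2^31-1): "Small" is
  // the indefinite value 0x80000000, "Big" is cvttps2dq(3e9 - 2^31) ==
  // 0x32D05E00, and OR'ing the sign-splat-masked "Big" back into "Small"
  // reconstructs 0xB2D05E00 == 3000000000.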
21309 
21310   // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21311   // use the slightly slower blendv select instead.
21312   if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21313     SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21314     return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21315   }
21316 
21317   SDValue IsOverflown =
21318       DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21319                   DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21320   return DAG.getNode(ISD::OR, dl, VT, Small,
21321                      DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21322 }
21323 
21324 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21325   bool IsStrict = Op->isStrictFPOpcode();
21326   bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21327                   Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21328   MVT VT = Op->getSimpleValueType(0);
21329   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21330   MVT SrcVT = Src.getSimpleValueType();
21331   SDLoc dl(Op);
21332 
21333   if (VT.isVector()) {
21334     if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21335       MVT ResVT = MVT::v4i32;
21336       MVT TruncVT = MVT::v4i1;
21337       unsigned Opc;
21338       if (IsStrict)
21339         Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21340       else
21341         Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21342 
21343       if (!IsSigned && !Subtarget.hasVLX()) {
21344         assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21345         // Widen to 512-bits.
21346         ResVT = MVT::v8i32;
21347         TruncVT = MVT::v8i1;
21348         Opc = Op.getOpcode();
21349         // Need to concat with zero vector for strict fp to avoid spurious
21350         // exceptions.
21351         // TODO: Should we just do this for non-strict as well?
21352         SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21353                                : DAG.getUNDEF(MVT::v8f64);
21354         Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21355                           DAG.getIntPtrConstant(0, dl));
21356       }
21357       SDValue Res, Chain;
21358       if (IsStrict) {
21359         Res =
21360             DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
21361         Chain = Res.getValue(1);
21362       } else {
21363         Res = DAG.getNode(Opc, dl, ResVT, Src);
21364       }
21365 
21366       Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21367       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21368                         DAG.getIntPtrConstant(0, dl));
21369       if (IsStrict)
21370         return DAG.getMergeValues({Res, Chain}, dl);
21371       return Res;
21372     }
21373 
21374     // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21375     if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21376       assert(!IsSigned && "Expected unsigned conversion!");
21377       assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21378       return Op;
21379     }
21380 
21381     // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21382     if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21383         (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21384         Subtarget.useAVX512Regs()) {
21385       assert(!IsSigned && "Expected unsigned conversion!");
21386       assert(!Subtarget.hasVLX() && "Unexpected features!");
21387       MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21388       MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21389       // Need to concat with zero vector for strict fp to avoid spurious
21390       // exceptions.
21391       // TODO: Should we just do this for non-strict as well?
21392       SDValue Tmp =
21393           IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21394       Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21395                         DAG.getIntPtrConstant(0, dl));
21396 
21397       SDValue Res, Chain;
21398       if (IsStrict) {
21399         Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21400                           {Op->getOperand(0), Src});
21401         Chain = Res.getValue(1);
21402       } else {
21403         Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21404       }
21405 
21406       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21407                         DAG.getIntPtrConstant(0, dl));
21408 
21409       if (IsStrict)
21410         return DAG.getMergeValues({Res, Chain}, dl);
21411       return Res;
21412     }
21413 
21414     // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21415     if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21416         (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21417         Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21418       assert(!Subtarget.hasVLX() && "Unexpected features!");
21419       MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21420       // Need to concat with zero vector for strict fp to avoid spurious
21421       // exceptions.
21422       // TODO: Should we just do this for non-strict as well?
21423       SDValue Tmp =
21424           IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21425       Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21426                         DAG.getIntPtrConstant(0, dl));
21427 
21428       SDValue Res, Chain;
21429       if (IsStrict) {
21430         Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21431                           {Op->getOperand(0), Src});
21432         Chain = Res.getValue(1);
21433       } else {
21434         Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21435       }
21436 
21437       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21438                         DAG.getIntPtrConstant(0, dl));
21439 
21440       if (IsStrict)
21441         return DAG.getMergeValues({Res, Chain}, dl);
21442       return Res;
21443     }
21444 
21445     if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21446       if (!Subtarget.hasVLX()) {
        // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the
        // type legalizer and then widened again by vector op legalization.
21449         if (!IsStrict)
21450           return SDValue();
21451 
21452         SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21453         SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21454                                   {Src, Zero, Zero, Zero});
21455         Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21456                           {Op->getOperand(0), Tmp});
21457         SDValue Chain = Tmp.getValue(1);
21458         Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21459                           DAG.getIntPtrConstant(0, dl));
21460         return DAG.getMergeValues({Tmp, Chain}, dl);
21461       }
21462 
21463       assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21464       SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21465                                 DAG.getUNDEF(MVT::v2f32));
21466       if (IsStrict) {
21467         unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21468                                 : X86ISD::STRICT_CVTTP2UI;
21469         return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21470       }
21471       unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21472       return DAG.getNode(Opc, dl, VT, Tmp);
21473     }
21474 
    // Generate optimized instructions for pre-AVX512 unsigned conversions
    // from vXf32/vXf64 to vXi32.
21477     if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21478         (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21479         (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21480       assert(!IsSigned && "Expected unsigned conversion!");
21481       return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21482     }
21483 
21484     return SDValue();
21485   }
21486 
21487   assert(!VT.isVector());
21488 
21489   bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21490 
21491   if (!IsSigned && UseSSEReg) {
21492     // Conversions from f32/f64 with AVX512 should be legal.
21493     if (Subtarget.hasAVX512())
21494       return Op;
21495 
21496     // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21497     // behaves on out of range inputs to generate optimized conversions.
21498     if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21499                       (VT == MVT::i64 && Subtarget.is64Bit()))) {
21500       unsigned DstBits = VT.getScalarSizeInBits();
21501       APInt UIntLimit = APInt::getSignMask(DstBits);
21502       SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21503                                         DAG.getConstant(UIntLimit, dl, VT));
21504       MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21505 
21506       // Calculate the converted result for values in the range:
21507       // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21508       // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21509       SDValue Small =
21510           DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21511                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21512       SDValue Big = DAG.getNode(
21513           X86ISD::CVTTS2SI, dl, VT,
21514           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21515                       DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21516 
      // The "CVTTS2SI" instruction conveniently sets the sign bit if
      // and only if the value was out of range. So we can use that
      // as our indicator that we should use "Big" instead of "Small".
21520       //
21521       // Use "Small" if "IsOverflown" has all bits cleared
21522       // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21523       SDValue IsOverflown = DAG.getNode(
21524           ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21525       return DAG.getNode(ISD::OR, dl, VT, Small,
21526                          DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21527     }
21528 
21529     // Use default expansion for i64.
21530     if (VT == MVT::i64)
21531       return SDValue();
21532 
21533     assert(VT == MVT::i32 && "Unexpected VT!");
21534 
21535     // Promote i32 to i64 and use a signed operation on 64-bit targets.
21536     // FIXME: This does not generate an invalid exception if the input does not
21537     // fit in i32. PR44019
21538     if (Subtarget.is64Bit()) {
21539       SDValue Res, Chain;
21540       if (IsStrict) {
21541         Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
21542                           { Op.getOperand(0), Src });
21543         Chain = Res.getValue(1);
21544       } else
21545         Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21546 
21547       Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21548       if (IsStrict)
21549         return DAG.getMergeValues({ Res, Chain }, dl);
21550       return Res;
21551     }
21552 
21553     // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21554     // use fisttp which will be handled later.
21555     if (!Subtarget.hasSSE3())
21556       return SDValue();
21557   }
21558 
  // Promote i16 to i32 if we can use an SSE operation or the type is f128.
21560   // FIXME: This does not generate an invalid exception if the input does not
21561   // fit in i16. PR44019
21562   if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21563     assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21564     SDValue Res, Chain;
21565     if (IsStrict) {
21566       Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
21567                         { Op.getOperand(0), Src });
21568       Chain = Res.getValue(1);
21569     } else
21570       Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21571 
21572     Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21573     if (IsStrict)
21574       return DAG.getMergeValues({ Res, Chain }, dl);
21575     return Res;
21576   }
21577 
21578   // If this is a FP_TO_SINT using SSEReg we're done.
21579   if (UseSSEReg && IsSigned)
21580     return Op;
21581 
21582   // fp128 needs to use a libcall.
21583   if (SrcVT == MVT::f128) {
21584     RTLIB::Libcall LC;
21585     if (IsSigned)
21586       LC = RTLIB::getFPTOSINT(SrcVT, VT);
21587     else
21588       LC = RTLIB::getFPTOUINT(SrcVT, VT);
21589 
21590     SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21591     MakeLibCallOptions CallOptions;
21592     std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
21593                                                   SDLoc(Op), Chain);
21594 
21595     if (IsStrict)
21596       return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21597 
21598     return Tmp.first;
21599   }
21600 
21601   // Fall back to X87.
21602   SDValue Chain;
21603   if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21604     if (IsStrict)
21605       return DAG.getMergeValues({V, Chain}, dl);
21606     return V;
21607   }
21608 
21609   llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21610 }
21611 
21612 SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21613                                              SelectionDAG &DAG) const {
21614   SDValue Src = Op.getOperand(0);
21615   MVT SrcVT = Src.getSimpleValueType();
21616 
21617   // If the source is in an SSE register, the node is Legal.
21618   if (isScalarFPTypeInSSEReg(SrcVT))
21619     return Op;
21620 
21621   return LRINT_LLRINTHelper(Op.getNode(), DAG);
21622 }
21623 
21624 SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21625                                               SelectionDAG &DAG) const {
21626   EVT DstVT = N->getValueType(0);
21627   SDValue Src = N->getOperand(0);
21628   EVT SrcVT = Src.getValueType();
21629 
21630   if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21631     // f16 must be promoted before using the lowering in this routine.
21632     // fp128 does not use this lowering.
21633     return SDValue();
21634   }
21635 
21636   SDLoc DL(N);
21637   SDValue Chain = DAG.getEntryNode();
21638 
21639   bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21640 
21641   // If we're converting from SSE, the stack slot needs to hold both types.
21642   // Otherwise it only needs to hold the DstVT.
21643   EVT OtherVT = UseSSE ? SrcVT : DstVT;
21644   SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21645   int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21646   MachinePointerInfo MPI =
21647       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21648 
21649   if (UseSSE) {
21650     assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21651     Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21652     SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21653     SDValue Ops[] = { Chain, StackPtr };
21654 
21655     Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21656                                   /*Align*/ None, MachineMemOperand::MOLoad);
21657     Chain = Src.getValue(1);
21658   }
21659 
21660   SDValue StoreOps[] = { Chain, Src, StackPtr };
21661   Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21662                                   StoreOps, DstVT, MPI, /*Align*/ None,
21663                                   MachineMemOperand::MOStore);
21664 
21665   return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21666 }
21667 
21668 SDValue
21669 X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
21670   // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
21671   // but making use of X86 specifics to produce better instruction sequences.
21672   SDNode *Node = Op.getNode();
21673   bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21674   unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
21675   SDLoc dl(SDValue(Node, 0));
21676   SDValue Src = Node->getOperand(0);
21677 
21678   // There are three types involved here: SrcVT is the source floating point
21679   // type, DstVT is the type of the result, and TmpVT is the result of the
21680   // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
21681   // DstVT).
21682   EVT SrcVT = Src.getValueType();
21683   EVT DstVT = Node->getValueType(0);
21684   EVT TmpVT = DstVT;
21685 
21686   // This code is only for floats and doubles. Fall back to generic code for
21687   // anything else.
21688   if (!isScalarFPTypeInSSEReg(SrcVT))
21689     return SDValue();
21690 
21691   EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21692   unsigned SatWidth = SatVT.getScalarSizeInBits();
21693   unsigned DstWidth = DstVT.getScalarSizeInBits();
21694   unsigned TmpWidth = TmpVT.getScalarSizeInBits();
21695   assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21696          "Expected saturation width smaller than result width");
21697 
21698   // Promote result of FP_TO_*INT to at least 32 bits.
21699   if (TmpWidth < 32) {
21700     TmpVT = MVT::i32;
21701     TmpWidth = 32;
21702   }
21703 
21704   // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21705   // us to use a native signed conversion instead.
21706   if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21707     TmpVT = MVT::i64;
21708     TmpWidth = 64;
21709   }
21710 
21711   // If the saturation width is smaller than the size of the temporary result,
21712   // we can always use signed conversion, which is native.
21713   if (SatWidth < TmpWidth)
21714     FpToIntOpcode = ISD::FP_TO_SINT;
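  // (Every value in the SatWidth-bit saturation range also fits in the
  // signed TmpWidth-bit range, so the signed conversion cannot overflow
  // here.)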
21715 
21716   // Determine minimum and maximum integer values and their corresponding
21717   // floating-point values.
21718   APInt MinInt, MaxInt;
21719   if (IsSigned) {
21720     MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
21721     MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
21722   } else {
21723     MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
21724     MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
21725   }
21726 
21727   APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21728   APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21729 
21730   APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21731     MinInt, IsSigned, APFloat::rmTowardZero);
21732   APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21733     MaxInt, IsSigned, APFloat::rmTowardZero);
21734   bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21735                           && !(MaxStatus & APFloat::opStatus::opInexact);
21736 
21737   SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21738   SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21739 
21740   // If the integer bounds are exactly representable as floats, emit a
21741   // min+max+fptoi sequence. Otherwise use comparisons and selects.
21742   if (AreExactFloatBounds) {
21743     if (DstVT != TmpVT) {
21744       // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21745       SDValue MinClamped = DAG.getNode(
21746         X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21747       // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21748       SDValue BothClamped = DAG.getNode(
21749         X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21750       // Convert clamped value to integer.
21751       SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21752 
21753       // NaN will become INDVAL, with the top bit set and the rest zero.
21754       // Truncation will discard the top bit, resulting in zero.
21755       return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21756     }
21757 
21758     // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21759     SDValue MinClamped = DAG.getNode(
21760       X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21761     // Clamp by MaxFloat from above. NaN cannot occur.
21762     SDValue BothClamped = DAG.getNode(
21763       X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21764     // Convert clamped value to integer.
21765     SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21766 
21767     if (!IsSigned) {
21768       // In the unsigned case we're done, because we mapped NaN to MinFloat,
21769       // which is zero.
21770       return FpToInt;
21771     }
21772 
21773     // Otherwise, select zero if Src is NaN.
21774     SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21775     return DAG.getSelectCC(
21776       dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21777   }
21778 
21779   SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21780   SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21781 
21782   // Result of direct conversion, which may be selected away.
21783   SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21784 
21785   if (DstVT != TmpVT) {
21786     // NaN will become INDVAL, with the top bit set and the rest zero.
21787     // Truncation will discard the top bit, resulting in zero.
21788     FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21789   }
21790 
21791   SDValue Select = FpToInt;
21792   // For signed conversions where we saturate to the same size as the
21793   // result type of the fptoi instructions, INDVAL coincides with integer
21794   // minimum, so we don't need to explicitly check it.
21795   if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21796     // If Src ULT MinFloat, select MinInt. In particular, this also selects
21797     // MinInt if Src is NaN.
21798     Select = DAG.getSelectCC(
21799       dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21800   }
21801 
21802   // If Src OGT MaxFloat, select MaxInt.
21803   Select = DAG.getSelectCC(
21804     dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21805 
21806   // In the unsigned case we are done, because we mapped NaN to MinInt, which
21807   // is already zero. The promoted case was already handled above.
21808   if (!IsSigned || DstVT != TmpVT) {
21809     return Select;
21810   }
21811 
21812   // Otherwise, select 0 if Src is NaN.
21813   SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21814   return DAG.getSelectCC(
21815     dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21816 }
21817 
21818 SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21819   bool IsStrict = Op->isStrictFPOpcode();
21820 
21821   SDLoc DL(Op);
21822   MVT VT = Op.getSimpleValueType();
21823   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21824   MVT SVT = In.getSimpleValueType();
21825 
21826   if (VT == MVT::f128)
21827     return SDValue();
21828 
21829   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21830 
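  // Widen v2f32 to v4f32 with undef upper elements; (V)CVTPS2PD only reads
  // the two low f32 lanes, so the undef lanes are never converted.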
21831   SDValue Res =
21832       DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21833   if (IsStrict)
21834     return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21835                        {Op->getOperand(0), Res});
21836   return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21837 }
21838 
21839 SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21840   bool IsStrict = Op->isStrictFPOpcode();
21841   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
  // It's legal except when f128 is involved.
21843   if (In.getSimpleValueType() != MVT::f128)
21844     return Op;
21845 
21846   return SDValue();
21847 }
21848 
21849 static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
21850   bool IsStrict = Op->isStrictFPOpcode();
21851   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21852   assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
21853          "Unexpected VT!");
21854 
21855   SDLoc dl(Op);
21856   SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
21857                             DAG.getConstant(0, dl, MVT::v8i16), Src,
21858                             DAG.getIntPtrConstant(0, dl));
21859 
21860   SDValue Chain;
21861   if (IsStrict) {
21862     Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
21863                       {Op.getOperand(0), Res});
21864     Chain = Res.getValue(1);
21865   } else {
21866     Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
21867   }
21868 
21869   Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
21870                     DAG.getIntPtrConstant(0, dl));
21871 
21872   if (IsStrict)
21873     return DAG.getMergeValues({Res, Chain}, dl);
21874 
21875   return Res;
21876 }
21877 
21878 static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
21879   bool IsStrict = Op->isStrictFPOpcode();
21880   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21881   assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
21882          "Unexpected VT!");
21883 
21884   SDLoc dl(Op);
21885   SDValue Res, Chain;
21886   if (IsStrict) {
21887     Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
21888                       DAG.getConstantFP(0, dl, MVT::v4f32), Src,
21889                       DAG.getIntPtrConstant(0, dl));
21890     Res = DAG.getNode(
21891         X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
21892         {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
21893     Chain = Res.getValue(1);
21894   } else {
21895     // FIXME: Should we use zeros for upper elements for non-strict?
21896     Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
21897     Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
21898                       DAG.getTargetConstant(4, dl, MVT::i32));
21899   }
21900 
21901   Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
21902                     DAG.getIntPtrConstant(0, dl));
21903 
21904   if (IsStrict)
21905     return DAG.getMergeValues({Res, Chain}, dl);
21906 
21907   return Res;
21908 }
21909 
21910 /// Depending on uarch and/or optimizing for size, we might prefer to use a
21911 /// vector operation in place of the typical scalar operation.
21912 static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
21913                                          const X86Subtarget &Subtarget) {
21914   // If both operands have other uses, this is probably not profitable.
21915   SDValue LHS = Op.getOperand(0);
21916   SDValue RHS = Op.getOperand(1);
21917   if (!LHS.hasOneUse() && !RHS.hasOneUse())
21918     return Op;
21919 
21920   // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
21921   bool IsFP = Op.getSimpleValueType().isFloatingPoint();
21922   if (IsFP && !Subtarget.hasSSE3())
21923     return Op;
21924   if (!IsFP && !Subtarget.hasSSSE3())
21925     return Op;
21926 
21927   // Extract from a common vector.
21928   if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21929       RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21930       LHS.getOperand(0) != RHS.getOperand(0) ||
21931       !isa<ConstantSDNode>(LHS.getOperand(1)) ||
21932       !isa<ConstantSDNode>(RHS.getOperand(1)) ||
21933       !shouldUseHorizontalOp(true, DAG, Subtarget))
21934     return Op;
21935 
21936   // Allow commuted 'hadd' ops.
21937   // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
21938   unsigned HOpcode;
21939   switch (Op.getOpcode()) {
21940     case ISD::ADD: HOpcode = X86ISD::HADD; break;
21941     case ISD::SUB: HOpcode = X86ISD::HSUB; break;
21942     case ISD::FADD: HOpcode = X86ISD::FHADD; break;
21943     case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
21944     default:
21945       llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
21946   }
21947   unsigned LExtIndex = LHS.getConstantOperandVal(1);
21948   unsigned RExtIndex = RHS.getConstantOperandVal(1);
21949   if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
21950       (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
21951     std::swap(LExtIndex, RExtIndex);
21952 
21953   if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
21954     return Op;
21955 
21956   SDValue X = LHS.getOperand(0);
21957   EVT VecVT = X.getValueType();
21958   unsigned BitWidth = VecVT.getSizeInBits();
21959   unsigned NumLanes = BitWidth / 128;
21960   unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
21961   assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
21962          "Not expecting illegal vector widths here");
21963 
21964   // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
21965   // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
21966   SDLoc DL(Op);
21967   if (BitWidth == 256 || BitWidth == 512) {
21968     unsigned LaneIdx = LExtIndex / NumEltsPerLane;
21969     X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
21970     LExtIndex %= NumEltsPerLane;
21971   }
21972 
21973   // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
21974   // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
21975   // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
21976   // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
21977   SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
21978   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
21979                      DAG.getIntPtrConstant(LExtIndex / 2, DL));
21980 }
21981 
21982 /// Depending on uarch and/or optimizing for size, we might prefer to use a
21983 /// vector operation in place of the typical scalar operation.
21984 SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
21985   assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
21986          "Only expecting float/double");
21987   return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
21988 }
21989 
21990 /// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
21991 /// This mode isn't supported in hardware on X86. But as long as we aren't
21992 /// compiling with trapping math, we can emulate this with
21993 /// floor(X + copysign(nextafter(0.5, 0.0), X)).
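/// Using the value just below 0.5 (e.g. 0x3EFFFFFF for f32) rather than 0.5
/// itself keeps inputs just below a halfway point (such as the largest float
/// less than 0.5) from being rounded up to the next integer.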
21994 static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
21995   SDValue N0 = Op.getOperand(0);
21996   SDLoc dl(Op);
21997   MVT VT = Op.getSimpleValueType();
21998 
21999   // N0 += copysign(nextafter(0.5, 0.0), N0)
22000   const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22001   bool Ignored;
22002   APFloat Point5Pred = APFloat(0.5f);
22003   Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22004   Point5Pred.next(/*nextDown*/true);
22005 
22006   SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22007                               DAG.getConstantFP(Point5Pred, dl, VT), N0);
22008   N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22009 
22010   // Truncate the result to remove fraction.
22011   return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22012 }
22013 
22014 /// The only differences between FABS and FNEG are the mask and the logic op.
22015 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
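/// For example, with the masks and logic ops selected below:
///   FABS(x)       --> FAND(x, 0x7f...f)
///   FNEG(x)       --> FXOR(x, 0x80...0)
///   FNEG(FABS(x)) --> FOR(x,  0x80...0)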
22016 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22017   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22018          "Wrong opcode for lowering FABS or FNEG.");
22019 
22020   bool IsFABS = (Op.getOpcode() == ISD::FABS);
22021 
  // If this is an FABS and it has an FNEG user, bail out to fold the
  // combination into an FNABS. We'll lower the FABS after that if it is still
  // in use.
22024   if (IsFABS)
22025     for (SDNode *User : Op->uses())
22026       if (User->getOpcode() == ISD::FNEG)
22027         return Op;
22028 
22029   SDLoc dl(Op);
22030   MVT VT = Op.getSimpleValueType();
22031 
22032   bool IsF128 = (VT == MVT::f128);
22033   assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
22034           VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
22035           VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
22036          "Unexpected type in LowerFABSorFNEG");
22037 
22038   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
22039   // decide if we should generate a 16-byte constant mask when we only need 4 or
22040   // 8 bytes for the scalar case.
22041 
22042   // There are no scalar bitwise logical SSE/AVX instructions, so we
22043   // generate a 16-byte vector constant and logic op even for the scalar case.
22044   // Using a 16-byte mask allows folding the load of the mask with
22045   // the logic op, so it can save (~4 bytes) on code size.
22046   bool IsFakeVector = !VT.isVector() && !IsF128;
22047   MVT LogicVT = VT;
22048   if (IsFakeVector)
22049     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
22050 
22051   unsigned EltBits = VT.getScalarSizeInBits();
22052   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22053   APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22054                            APInt::getSignMask(EltBits);
22055   const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22056   SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22057 
22058   SDValue Op0 = Op.getOperand(0);
22059   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22060   unsigned LogicOp = IsFABS  ? X86ISD::FAND :
22061                      IsFNABS ? X86ISD::FOR  :
22062                                X86ISD::FXOR;
22063   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22064 
22065   if (VT.isVector() || IsF128)
22066     return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22067 
22068   // For the scalar case extend to a 128-bit vector, perform the logic op,
22069   // and extract the scalar result back out.
22070   Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22071   SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22072   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22073                      DAG.getIntPtrConstant(0, dl));
22074 }
22075 
22076 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22077   SDValue Mag = Op.getOperand(0);
22078   SDValue Sign = Op.getOperand(1);
22079   SDLoc dl(Op);
22080 
22081   // If the sign operand is smaller, extend it first.
22082   MVT VT = Op.getSimpleValueType();
22083   if (Sign.getSimpleValueType().bitsLT(VT))
22084     Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22085 
22086   // And if it is bigger, shrink it first.
22087   if (Sign.getSimpleValueType().bitsGT(VT))
22088     Sign =
22089         DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(0, dl));
22090 
22091   // At this point the operands and the result should have the same
22092   // type, and that won't be f80 since that is not custom lowered.
22093   bool IsF128 = (VT == MVT::f128);
22094   assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
22095           VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
22096           VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
22097          "Unexpected type in LowerFCOPYSIGN");
22098 
22099   const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22100 
22101   // Perform all scalar logic operations as 16-byte vectors because there are no
22102   // scalar FP logic instructions in SSE.
22103   // TODO: This isn't necessary. If we used scalar types, we might avoid some
22104   // unnecessary splats, but we might miss load folding opportunities. Should
22105   // this decision be based on OptimizeForSize?
22106   bool IsFakeVector = !VT.isVector() && !IsF128;
22107   MVT LogicVT = VT;
22108   if (IsFakeVector)
22109     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
22110 
22111   // The mask constants are automatically splatted for vector types.
22112   unsigned EltSizeInBits = VT.getScalarSizeInBits();
22113   SDValue SignMask = DAG.getConstantFP(
22114       APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22115   SDValue MagMask = DAG.getConstantFP(
22116       APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
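  // i.e. copysign(Mag, Sign) == (Mag & MagMask) | (Sign & SignMask).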
22117 
22118   // First, clear all bits but the sign bit from the second operand (sign).
22119   if (IsFakeVector)
22120     Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22121   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22122 
22123   // Next, clear the sign bit from the first operand (magnitude).
22124   // TODO: If we had general constant folding for FP logic ops, this check
22125   // wouldn't be necessary.
22126   SDValue MagBits;
22127   if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22128     APFloat APF = Op0CN->getValueAPF();
22129     APF.clearSign();
22130     MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22131   } else {
22132     // If the magnitude operand wasn't a constant, we need to AND out the sign.
22133     if (IsFakeVector)
22134       Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22135     MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22136   }
22137 
22138   // OR the magnitude value with the sign bit.
22139   SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22140   return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22141                                           DAG.getIntPtrConstant(0, dl));
22142 }
22143 
22144 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22145   SDValue N0 = Op.getOperand(0);
22146   SDLoc dl(Op);
22147   MVT VT = Op.getSimpleValueType();
22148 
22149   MVT OpVT = N0.getSimpleValueType();
22150   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22151          "Unexpected type for FGETSIGN");
22152 
22153   // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22154   MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22155   SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22156   Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22157   Res = DAG.getZExtOrTrunc(Res, dl, VT);
22158   Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22159   return Res;
22160 }
22161 
22162 /// Helper for creating a X86ISD::SETCC node.
22163 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22164                         SelectionDAG &DAG) {
22165   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22166                      DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22167 }
22168 
22169 /// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
22170 /// style scalarized (associative) reduction patterns. Partial reductions
22171 /// are supported when the pointer SrcMask is non-null.
22172 /// TODO - move this to SelectionDAG?
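/// e.g. OR(EXTRACTELT(X,0), OR(EXTRACTELT(X,1), OR(EXTRACTELT(X,2),
/// EXTRACTELT(X,3)))) matches with SrcOps == { X } for a 4-element X.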
22173 static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22174                                  SmallVectorImpl<SDValue> &SrcOps,
22175                                  SmallVectorImpl<APInt> *SrcMask = nullptr) {
22176   SmallVector<SDValue, 8> Opnds;
22177   DenseMap<SDValue, APInt> SrcOpMap;
22178   EVT VT = MVT::Other;
22179 
  // Recognize the special case where a vector is cast into a wide integer to
  // test all 0s.
22182   assert(Op.getOpcode() == unsigned(BinOp) &&
22183          "Unexpected bit reduction opcode");
22184   Opnds.push_back(Op.getOperand(0));
22185   Opnds.push_back(Op.getOperand(1));
22186 
22187   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22188     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
22189     // BFS traverse all BinOp operands.
22190     if (I->getOpcode() == unsigned(BinOp)) {
22191       Opnds.push_back(I->getOperand(0));
22192       Opnds.push_back(I->getOperand(1));
22193       // Re-evaluate the number of nodes to be traversed.
22194       e += 2; // 2 more nodes (LHS and RHS) are pushed.
22195       continue;
22196     }
22197 
    // Quit if the operand is not an EXTRACT_VECTOR_ELT.
22199     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22200       return false;
22201 
    // Quit if the index is not a constant.
22203     auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22204     if (!Idx)
22205       return false;
22206 
22207     SDValue Src = I->getOperand(0);
22208     DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22209     if (M == SrcOpMap.end()) {
22210       VT = Src.getValueType();
22211       // Quit if not the same type.
22212       if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22213         return false;
22214       unsigned NumElts = VT.getVectorNumElements();
22215       APInt EltCount = APInt::getNullValue(NumElts);
22216       M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22217       SrcOps.push_back(Src);
22218     }
22219 
22220     // Quit if element already used.
22221     unsigned CIdx = Idx->getZExtValue();
22222     if (M->second[CIdx])
22223       return false;
22224     M->second.setBit(CIdx);
22225   }
22226 
22227   if (SrcMask) {
22228     // Collect the source partial masks.
22229     for (SDValue &SrcOp : SrcOps)
22230       SrcMask->push_back(SrcOpMap[SrcOp]);
22231   } else {
22232     // Quit if not all elements are used.
22233     for (const auto &I : SrcOpMap)
22234       if (!I.second.isAllOnesValue())
22235         return false;
22236   }
22237 
22238   return true;
22239 }
22240 
22241 // Helper function for comparing all bits of a vector against zero.
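// Lowers to PTEST(V,V) when available, otherwise to
// CMP(MOVMSK(PCMPEQB(V,0)), 0xFFFF) (or a scalar integer CMP for sub-128-bit
// vectors), reporting the EFLAGS condition to use through X86CC.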
22242 static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
22243                                   const APInt &Mask,
22244                                   const X86Subtarget &Subtarget,
22245                                   SelectionDAG &DAG, X86::CondCode &X86CC) {
22246   EVT VT = V.getValueType();
22247   unsigned ScalarSize = VT.getScalarSizeInBits();
22248   if (Mask.getBitWidth() != ScalarSize) {
22249     assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22250     return SDValue();
22251   }
22252 
22253   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22254   X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22255 
22256   auto MaskBits = [&](SDValue Src) {
22257     if (Mask.isAllOnesValue())
22258       return Src;
22259     EVT SrcVT = Src.getValueType();
22260     SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22261     return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22262   };
22263 
22264   // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22265   if (VT.getSizeInBits() < 128) {
22266     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22267     if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
22268       return SDValue();
22269     return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22270                        DAG.getBitcast(IntVT, MaskBits(V)),
22271                        DAG.getConstant(0, DL, IntVT));
22272   }
22273 
22274   // Quit if not splittable to 128/256-bit vector.
22275   if (!isPowerOf2_32(VT.getSizeInBits()))
22276     return SDValue();
22277 
22278   // Split down to 128/256-bit vector.
22279   unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
22280   while (VT.getSizeInBits() > TestSize) {
22281     auto Split = DAG.SplitVector(V, DL);
22282     VT = Split.first.getValueType();
22283     V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22284   }
22285 
22286   bool UsePTEST = Subtarget.hasSSE41();
22287   if (UsePTEST) {
22288     MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
22289     V = DAG.getBitcast(TestVT, MaskBits(V));
22290     return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22291   }
22292 
22293   // Without PTEST, a masked v2i64 or-reduction is not faster than
22294   // scalarization.
22295   if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
    return SDValue();
22297 
22298   V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
22299   V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
22300                   getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
22301   V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22302   return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22303                      DAG.getConstant(0xFFFF, DL, MVT::i32));
22304 }
22305 
// Check whether an OR'd reduction tree is PTEST-able, or if we can fall back
// to CMP(MOVMSK(PCMPEQB(X,0))).
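// e.g. (setcc (or (extractelt X, 0), (or ... (extractelt X, n))), 0, seteq)
// becomes a single test of the whole vector X against zero.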
22308 static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
22309                                       const SDLoc &DL,
22310                                       const X86Subtarget &Subtarget,
22311                                       SelectionDAG &DAG, SDValue &X86CC) {
22312   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22313 
22314   if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22315     return SDValue();
22316 
22317   // Check whether we're masking/truncating an OR-reduction result, in which
22318   // case track the masked bits.
22319   APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
22320   switch (Op.getOpcode()) {
22321   case ISD::TRUNCATE: {
22322     SDValue Src = Op.getOperand(0);
22323     Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22324                                 Op.getScalarValueSizeInBits());
22325     Op = Src;
22326     break;
22327   }
22328   case ISD::AND: {
22329     if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22330       Mask = Cst->getAPIntValue();
22331       Op = Op.getOperand(0);
22332     }
22333     break;
22334   }
22335   }
22336 
22337   SmallVector<SDValue, 8> VecIns;
22338   if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
22339     EVT VT = VecIns[0].getValueType();
22340     assert(llvm::all_of(VecIns,
22341                         [VT](SDValue V) { return VT == V.getValueType(); }) &&
22342            "Reduction source vector mismatch");
22343 
    // Quit if less than 128 bits or not splittable to a 128/256-bit vector.
22345     if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
22346       return SDValue();
22347 
22348     // If more than one full vector is evaluated, OR them first before PTEST.
22349     for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22350          Slot += 2, e += 1) {
22351       // Each iteration will OR 2 nodes and append the result until there is
22352       // only 1 node left, i.e. the final OR'd value of all vectors.
22353       SDValue LHS = VecIns[Slot];
22354       SDValue RHS = VecIns[Slot + 1];
22355       VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
22356     }
22357 
22358     X86::CondCode CCode;
22359     if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
22360                                        DAG, CCode)) {
22361       X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
22362       return V;
22363     }
22364   }
22365 
22366   if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22367     ISD::NodeType BinOp;
22368     if (SDValue Match =
22369             DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
22370       X86::CondCode CCode;
22371       if (SDValue V =
22372               LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
22373         X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
22374         return V;
22375       }
22376     }
22377   }
22378 
22379   return SDValue();
22380 }
22381 
/// Return true if \c Op has a use that doesn't just read flags.
22383 static bool hasNonFlagsUse(SDValue Op) {
22384   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22385        ++UI) {
22386     SDNode *User = *UI;
22387     unsigned UOpNo = UI.getOperandNo();
22388     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
      // Look past the truncate.
22390       UOpNo = User->use_begin().getOperandNo();
22391       User = *User->use_begin();
22392     }
22393 
22394     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22395         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22396       return true;
22397   }
22398   return false;
22399 }
22400 
22401 // Transform to an x86-specific ALU node with flags if there is a chance of
22402 // using an RMW op or only the flags are used. Otherwise, leave
22403 // the node alone and emit a 'cmp' or 'test' instruction.
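// An RMW instruction such as 'addl %reg, (%mem)' sets EFLAGS itself, so the
// separate 'cmp'/'test' can be avoided when every user falls into the cases
// checked below (flag reads, stores, and register copies).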
22404 static bool isProfitableToUseFlagOp(SDValue Op) {
22405   for (SDNode *U : Op->uses())
22406     if (U->getOpcode() != ISD::CopyToReg &&
22407         U->getOpcode() != ISD::SETCC &&
22408         U->getOpcode() != ISD::STORE)
22409       return false;
22410 
22411   return true;
22412 }
22413 
22414 /// Emit nodes that will be selected as "test Op0,Op0", or something
22415 /// equivalent.
22416 static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22417                         SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22418   // CF and OF aren't always set the way we want. Determine which
22419   // of these we need.
22420   bool NeedCF = false;
22421   bool NeedOF = false;
22422   switch (X86CC) {
22423   default: break;
22424   case X86::COND_A: case X86::COND_AE:
22425   case X86::COND_B: case X86::COND_BE:
22426     NeedCF = true;
22427     break;
22428   case X86::COND_G: case X86::COND_GE:
22429   case X86::COND_L: case X86::COND_LE:
22430   case X86::COND_O: case X86::COND_NO: {
    // Check if we really need to set the Overflow flag. If NoSignedWrap is
    // present, it is not actually needed.
22434     switch (Op->getOpcode()) {
22435     case ISD::ADD:
22436     case ISD::SUB:
22437     case ISD::MUL:
22438     case ISD::SHL:
22439       if (Op.getNode()->getFlags().hasNoSignedWrap())
22440         break;
22441       LLVM_FALLTHROUGH;
22442     default:
22443       NeedOF = true;
22444       break;
22445     }
22446     break;
22447   }
22448   }
22449   // See if we can use the EFLAGS value from the operand instead of
22450   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22451   // we prove that the arithmetic won't overflow, we can't use OF or CF.
22452   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22453     // Emit a CMP with 0, which is the TEST pattern.
22454     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22455                        DAG.getConstant(0, dl, Op.getValueType()));
22456   }
22457   unsigned Opcode = 0;
22458   unsigned NumOperands = 0;
22459 
22460   SDValue ArithOp = Op;
22461 
  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
  // which may be the result of a CAST. We use the variable 'Op', which is the
  // non-cast value, when we check for possible users.
22465   switch (ArithOp.getOpcode()) {
22466   case ISD::AND:
22467     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22468     // because a TEST instruction will be better.
22469     if (!hasNonFlagsUse(Op))
22470       break;
22471 
22472     LLVM_FALLTHROUGH;
22473   case ISD::ADD:
22474   case ISD::SUB:
22475   case ISD::OR:
22476   case ISD::XOR:
22477     if (!isProfitableToUseFlagOp(Op))
22478       break;
22479 
22480     // Otherwise use a regular EFLAGS-setting instruction.
22481     switch (ArithOp.getOpcode()) {
22482     default: llvm_unreachable("unexpected operator!");
22483     case ISD::ADD: Opcode = X86ISD::ADD; break;
22484     case ISD::SUB: Opcode = X86ISD::SUB; break;
22485     case ISD::XOR: Opcode = X86ISD::XOR; break;
22486     case ISD::AND: Opcode = X86ISD::AND; break;
22487     case ISD::OR:  Opcode = X86ISD::OR;  break;
22488     }
22489 
22490     NumOperands = 2;
22491     break;
22492   case X86ISD::ADD:
22493   case X86ISD::SUB:
22494   case X86ISD::OR:
22495   case X86ISD::XOR:
22496   case X86ISD::AND:
22497     return SDValue(Op.getNode(), 1);
22498   case ISD::SSUBO:
22499   case ISD::USUBO: {
    // USUBO/SSUBO will become an X86ISD::SUB, and we can use its Z flag.
22501     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22502     return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22503                        Op->getOperand(1)).getValue(1);
22504   }
22505   default:
22506     break;
22507   }
22508 
22509   if (Opcode == 0) {
22510     // Emit a CMP with 0, which is the TEST pattern.
22511     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22512                        DAG.getConstant(0, dl, Op.getValueType()));
22513   }
22514   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22515   SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22516 
22517   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
22518   DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
22519   return SDValue(New.getNode(), 1);
22520 }
22521 
22522 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
22523 /// equivalent.
22524 static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
22525                        const SDLoc &dl, SelectionDAG &DAG,
22526                        const X86Subtarget &Subtarget) {
22527   if (isNullConstant(Op1))
22528     return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
22529 
22530   EVT CmpVT = Op0.getValueType();
22531 
22532   assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
22533           CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
22534 
  // Only promote the compare up to i32 if it is a 16-bit operation
  // with an immediate. 16-bit immediates are to be avoided.
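  // (A 16-bit immediate requires an operand-size prefix, and the resulting
  // length-changing prefix can stall instruction decode on some
  // microarchitectures.)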
22537   if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
22538       !DAG.getMachineFunction().getFunction().hasMinSize()) {
22539     ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
22540     ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
22541     // Don't do this if the immediate can fit in 8-bits.
22542     if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
22543         (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
22544       unsigned ExtendOp =
22545           isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22546       if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
        // For equality comparisons, try to use SIGN_EXTEND if the input was
        // truncated from something with enough sign bits.
22549         if (Op0.getOpcode() == ISD::TRUNCATE) {
22550           SDValue In = Op0.getOperand(0);
22551           unsigned EffBits =
22552               In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
22553           if (EffBits <= 16)
22554             ExtendOp = ISD::SIGN_EXTEND;
22555         } else if (Op1.getOpcode() == ISD::TRUNCATE) {
22556           SDValue In = Op1.getOperand(0);
22557           unsigned EffBits =
22558               In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
22559           if (EffBits <= 16)
22560             ExtendOp = ISD::SIGN_EXTEND;
22561         }
22562       }
22563 
22564       CmpVT = MVT::i32;
22565       Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
22566       Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
22567     }
22568   }
22569 
22570   // Try to shrink i64 compares if the input has enough zero bits.
  // FIXME: Do this for non-constant compares or when the constant is on the
  // LHS?
22572   if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
22573       Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
22574       cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
22575       DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
22576     CmpVT = MVT::i32;
22577     Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
22578     Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
22579   }
22580 
22581   // 0-x == y --> x+y == 0
22582   // 0-x != y --> x+y != 0
22583   if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
22584       Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22585     SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22586     SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
22587     return Add.getValue(1);
22588   }
22589 
22590   // x == 0-y --> x+y == 0
22591   // x != 0-y --> x+y != 0
22592   if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
22593       Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22594     SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22595     SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
22596     return Add.getValue(1);
22597   }
22598 
22599   // Use SUB instead of CMP to enable CSE between SUB and CMP.
22600   SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22601   SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
22602   return Sub.getValue(1);
22603 }
22604 
22605 /// Check if replacement of SQRT with RSQRT should be disabled.
22606 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
22607   EVT VT = Op.getValueType();
22608 
22609   // We never want to use both SQRT and RSQRT instructions for the same input.
22610   if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
22611     return false;
22612 
22613   if (VT.isVector())
22614     return Subtarget.hasFastVectorFSQRT();
22615   return Subtarget.hasFastScalarFSQRT();
22616 }
22617 
22618 /// The minimum architected relative accuracy is 2^-12. We need one
22619 /// Newton-Raphson step to have a good float result (24 bits of precision).
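/// The refinement itself is emitted by the generic estimate expansion; for
/// rsqrt, one Newton-Raphson step is Est' = Est * (1.5 - 0.5 * X * Est * Est).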
22620 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
22621                                            SelectionDAG &DAG, int Enabled,
22622                                            int &RefinementSteps,
22623                                            bool &UseOneConstNR,
22624                                            bool Reciprocal) const {
22625   EVT VT = Op.getValueType();
22626 
22627   // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22628   // It is likely not profitable to do this for f64 because a double-precision
22629   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
22630   // instructions: convert to single, rsqrtss, convert back to double, refine
22631   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
22632   // along with FMA, this could be a throughput win.
22633   // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
22634   // after legalize types.
22635   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22636       (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
22637       (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
22638       (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22639       (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22640     if (RefinementSteps == ReciprocalEstimate::Unspecified)
22641       RefinementSteps = 1;
22642 
22643     UseOneConstNR = false;
    // There is no 512-bit FRSQRT, but there is RSQRT14.
22645     unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
22646     return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
22647   }
22648   return SDValue();
22649 }
22650 
22651 /// The minimum architected relative accuracy is 2^-12. We need one
22652 /// Newton-Raphson step to have a good float result (24 bits of precision).
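/// As above, the refinement is emitted by the generic estimate expansion; one
/// Newton-Raphson step for the reciprocal is Est' = Est * (2.0 - X * Est).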
22653 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
22654                                             int Enabled,
22655                                             int &RefinementSteps) const {
22656   EVT VT = Op.getValueType();
22657 
22658   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22659   // It is likely not profitable to do this for f64 because a double-precision
22660   // reciprocal estimate with refinement on x86 prior to FMA requires
22661   // 15 instructions: convert to single, rcpss, convert back to double, refine
22662   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
22663   // along with FMA, this could be a throughput win.
22664 
22665   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22666       (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
22667       (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22668       (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22669     // Enable estimate codegen with 1 refinement step for vector division.
22670     // Scalar division estimates are disabled because they break too much
22671     // real-world code. These defaults are intended to match GCC behavior.
22672     if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
22673       return SDValue();
22674 
22675     if (RefinementSteps == ReciprocalEstimate::Unspecified)
22676       RefinementSteps = 1;
22677 
    // There is no 512-bit FRCP, but there is RCP14.
22679     unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
22680     return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
22681   }
22682   return SDValue();
22683 }
22684 
22685 /// If we have at least two divisions that use the same divisor, convert to
22686 /// multiplication by a reciprocal. This may need to be adjusted for a given
22687 /// CPU if a division's cost is not at least twice the cost of a multiplication.
22688 /// This is because we still need one division to calculate the reciprocal and
22689 /// then we need two multiplies by that reciprocal as replacements for the
22690 /// original divisions.
22691 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
22692   return 2;
22693 }
22694 
22695 SDValue
22696 X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
22697                                  SelectionDAG &DAG,
22698                                  SmallVectorImpl<SDNode *> &Created) const {
22699   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
22700   if (isIntDivCheap(N->getValueType(0), Attr))
22701     return SDValue(N,0); // Lower SDIV as SDIV
22702 
22703   assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
22704          "Unexpected divisor!");
22705 
22706   // Only perform this transform if CMOV is supported otherwise the select
22707   // below will become a branch.
22708   if (!Subtarget.hasCMov())
22709     return SDValue();
22710 
22711   // fold (sdiv X, pow2)
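  // e.g. (sdiv X, 8) --> (sra (select (setlt X, 0), (add X, 7), X), 3),
  // with the select expected to become a CMOV below.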
22712   EVT VT = N->getValueType(0);
22713   // FIXME: Support i8.
22714   if (VT != MVT::i16 && VT != MVT::i32 &&
22715       !(Subtarget.is64Bit() && VT == MVT::i64))
22716     return SDValue();
22717 
22718   unsigned Lg2 = Divisor.countTrailingZeros();
22719 
22720   // If the divisor is 2 or -2, the default expansion is better.
22721   if (Lg2 == 1)
22722     return SDValue();
22723 
22724   SDLoc DL(N);
22725   SDValue N0 = N->getOperand(0);
22726   SDValue Zero = DAG.getConstant(0, DL, VT);
22727   APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
22728   SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
22729 
22730   // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
22731   SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
22732   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
22733   SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
22734 
22735   Created.push_back(Cmp.getNode());
22736   Created.push_back(Add.getNode());
22737   Created.push_back(CMov.getNode());
22738 
22739   // Divide by pow2.
22740   SDValue SRA =
22741       DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
22742 
22743   // If we're dividing by a positive value, we're done.  Otherwise, we must
22744   // negate the result.
22745   if (Divisor.isNonNegative())
22746     return SRA;
22747 
22748   Created.push_back(SRA.getNode());
22749   return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
22750 }
22751 
22752 /// Result of 'and' is compared against zero. Change to a BT node if possible.
22753 /// Returns the BT node and the condition code needed to use it.
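/// e.g. (and X, (shl 1, Idx)) seteq/setne 0 --> (BT X, Idx), with COND_AE for
/// seteq and COND_B for setne.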
22754 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
22755                             const SDLoc &dl, SelectionDAG &DAG,
22756                             SDValue &X86CC) {
22757   assert(And.getOpcode() == ISD::AND && "Expected AND node!");
22758   SDValue Op0 = And.getOperand(0);
22759   SDValue Op1 = And.getOperand(1);
22760   if (Op0.getOpcode() == ISD::TRUNCATE)
22761     Op0 = Op0.getOperand(0);
22762   if (Op1.getOpcode() == ISD::TRUNCATE)
22763     Op1 = Op1.getOperand(0);
22764 
22765   SDValue Src, BitNo;
22766   if (Op1.getOpcode() == ISD::SHL)
22767     std::swap(Op0, Op1);
22768   if (Op0.getOpcode() == ISD::SHL) {
22769     if (isOneConstant(Op0.getOperand(0))) {
22770       // If we looked past a truncate, check that it's only truncating away
22771       // known zeros.
22772       unsigned BitWidth = Op0.getValueSizeInBits();
22773       unsigned AndBitWidth = And.getValueSizeInBits();
22774       if (BitWidth > AndBitWidth) {
22775         KnownBits Known = DAG.computeKnownBits(Op0);
22776         if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
22777           return SDValue();
22778       }
22779       Src = Op1;
22780       BitNo = Op0.getOperand(1);
22781     }
22782   } else if (Op1.getOpcode() == ISD::Constant) {
22783     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
22784     uint64_t AndRHSVal = AndRHS->getZExtValue();
22785     SDValue AndLHS = Op0;
22786 
22787     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
22788       Src = AndLHS.getOperand(0);
22789       BitNo = AndLHS.getOperand(1);
22790     } else {
22791       // Use BT if the immediate can't be encoded in a TEST instruction or we
      // are optimizing for size and the immediate won't fit in a byte.
22793       bool OptForSize = DAG.shouldOptForSize();
22794       if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
22795           isPowerOf2_64(AndRHSVal)) {
22796         Src = AndLHS;
22797         BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
22798                                 Src.getValueType());
22799       }
22800     }
22801   }
22802 
22803   // No patterns found, give up.
22804   if (!Src.getNode())
22805     return SDValue();
22806 
22807   // If Src is i8, promote it to i32 with any_extend.  There is no i8 BT
22808   // instruction.  Since the shift amount is in-range-or-undefined, we know
22809   // that doing a bittest on the i32 value is ok.  We extend to i32 because
22810   // the encoding for the i16 version is larger than the i32 version.
  // Also promote i16 to i32 for performance / code size reasons.
22812   if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
22813     Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
22814 
22815   // See if we can use the 32-bit instruction instead of the 64-bit one for a
22816   // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22817   // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22818   // known to be zero.
22819   if (Src.getValueType() == MVT::i64 &&
22820       DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22821     Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
22822 
22823   // If the operand types disagree, extend the shift amount to match.  Since
22824   // BT ignores high bits (like shifts) we can use anyextend.
22825   if (Src.getValueType() != BitNo.getValueType())
22826     BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
22827 
22828   X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
22829                                 dl, MVT::i8);
22830   return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
22831 }
22832 
22833 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
22834 /// CMPs.
22835 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
22836                                    SDValue &Op1, bool &IsAlwaysSignaling) {
22837   unsigned SSECC;
22838   bool Swap = false;
22839 
22840   // SSE Condition code mapping:
22841   //  0 - EQ
22842   //  1 - LT
22843   //  2 - LE
22844   //  3 - UNORD
22845   //  4 - NEQ
22846   //  5 - NLT
22847   //  6 - NLE
22848   //  7 - ORD
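  // Values 8 (EQ_UQ) and 12 (NEQ_OQ) returned below exist only as AVX
  // predicates; pre-AVX callers split those into two compares.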
22849   switch (SetCCOpcode) {
22850   default: llvm_unreachable("Unexpected SETCC condition");
22851   case ISD::SETOEQ:
22852   case ISD::SETEQ:  SSECC = 0; break;
22853   case ISD::SETOGT:
22854   case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
22855   case ISD::SETLT:
22856   case ISD::SETOLT: SSECC = 1; break;
22857   case ISD::SETOGE:
22858   case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
22859   case ISD::SETLE:
22860   case ISD::SETOLE: SSECC = 2; break;
22861   case ISD::SETUO:  SSECC = 3; break;
22862   case ISD::SETUNE:
22863   case ISD::SETNE:  SSECC = 4; break;
22864   case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
22865   case ISD::SETUGE: SSECC = 5; break;
22866   case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
22867   case ISD::SETUGT: SSECC = 6; break;
22868   case ISD::SETO:   SSECC = 7; break;
22869   case ISD::SETUEQ: SSECC = 8; break;
22870   case ISD::SETONE: SSECC = 12; break;
22871   }
22872   if (Swap)
22873     std::swap(Op0, Op1);
22874 
22875   switch (SetCCOpcode) {
22876   default:
22877     IsAlwaysSignaling = true;
22878     break;
22879   case ISD::SETEQ:
22880   case ISD::SETOEQ:
22881   case ISD::SETUEQ:
22882   case ISD::SETNE:
22883   case ISD::SETONE:
22884   case ISD::SETUNE:
22885   case ISD::SETO:
22886   case ISD::SETUO:
22887     IsAlwaysSignaling = false;
22888     break;
22889   }
22890 
22891   return SSECC;
22892 }
22893 
/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
/// concatenate the result back.
22896 static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
22897                               ISD::CondCode Cond, SelectionDAG &DAG,
22898                               const SDLoc &dl) {
22899   assert(VT.isInteger() && VT == LHS.getValueType() &&
22900          VT == RHS.getValueType() && "Unsupported VTs!");
22901 
22902   SDValue CC = DAG.getCondCode(Cond);
22903 
22904   // Extract the LHS Lo/Hi vectors
22905   SDValue LHS1, LHS2;
22906   std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
22907 
22908   // Extract the RHS Lo/Hi vectors
22909   SDValue RHS1, RHS2;
22910   std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
22911 
22912   // Issue the operation on the smaller types and concatenate the result back
22913   EVT LoVT, HiVT;
22914   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22915   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22916                      DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
22917                      DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
22918 }
22919 
22920 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
22921 
22922   SDValue Op0 = Op.getOperand(0);
22923   SDValue Op1 = Op.getOperand(1);
22924   SDValue CC = Op.getOperand(2);
22925   MVT VT = Op.getSimpleValueType();
22926   SDLoc dl(Op);
22927 
22928   assert(VT.getVectorElementType() == MVT::i1 &&
22929          "Cannot set masked compare for this operation");
22930 
22931   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
22932 
22933   // Prefer SETGT over SETLT.
22934   if (SetCCOpcode == ISD::SETLT) {
22935     SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
22936     std::swap(Op0, Op1);
22937   }
22938 
22939   return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
22940 }
22941 
22942 /// Given a buildvector constant, return a new vector constant with each element
22943 /// incremented or decremented. If incrementing or decrementing would result in
22944 /// unsigned overflow or underflow or this is not a simple vector constant,
22945 /// return an empty value.
22946 static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
22947   auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
22948   if (!BV)
22949     return SDValue();
22950 
22951   MVT VT = V.getSimpleValueType();
22952   MVT EltVT = VT.getVectorElementType();
22953   unsigned NumElts = VT.getVectorNumElements();
22954   SmallVector<SDValue, 8> NewVecC;
22955   SDLoc DL(V);
22956   for (unsigned i = 0; i < NumElts; ++i) {
22957     auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
22958     if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
22959       return SDValue();
22960 
22961     // Avoid overflow/underflow.
22962     const APInt &EltC = Elt->getAPIntValue();
22963     if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
22964       return SDValue();
22965 
22966     NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
22967   }
22968 
22969   return DAG.getBuildVector(VT, DL, NewVecC);
22970 }
22971 
22972 /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
22973 /// Op0 u<= Op1:
22974 ///   t = psubus Op0, Op1
22975 ///   pcmpeq t, <0..0>
22976 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
22977                                     ISD::CondCode Cond, const SDLoc &dl,
22978                                     const X86Subtarget &Subtarget,
22979                                     SelectionDAG &DAG) {
22980   if (!Subtarget.hasSSE2())
22981     return SDValue();
22982 
22983   MVT VET = VT.getVectorElementType();
22984   if (VET != MVT::i8 && VET != MVT::i16)
22985     return SDValue();
22986 
22987   switch (Cond) {
22988   default:
22989     return SDValue();
22990   case ISD::SETULT: {
    // If the comparison is against a constant, we can turn this into a
    // setule. With psubus, setule does not require a swap. This is
    // beneficial because the constant's register is no longer clobbered
    // as the destination, so it can be hoisted out of a loop.
    // Only do this pre-AVX, since vpcmp* is no longer destructive.
22996     if (Subtarget.hasAVX())
22997       return SDValue();
22998     SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
22999     if (!ULEOp1)
23000       return SDValue();
23001     Op1 = ULEOp1;
23002     break;
23003   }
23004   case ISD::SETUGT: {
23005     // If the comparison is against a constant, we can turn this into a setuge.
23006     // This is beneficial because materializing a constant 0 for the PCMPEQ is
23007     // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23008     // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23009     SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
23010     if (!UGEOp1)
23011       return SDValue();
23012     Op1 = Op0;
23013     Op0 = UGEOp1;
23014     break;
23015   }
23016   // Psubus is better than flip-sign because it requires no inversion.
23017   case ISD::SETUGE:
23018     std::swap(Op0, Op1);
23019     break;
23020   case ISD::SETULE:
23021     break;
23022   }
23023 
23024   SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23025   return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23026                      DAG.getConstant(0, dl, VT));
23027 }
23028 
23029 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23030                            SelectionDAG &DAG) {
23031   bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23032                   Op.getOpcode() == ISD::STRICT_FSETCCS;
23033   SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23034   SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23035   SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23036   MVT VT = Op->getSimpleValueType(0);
23037   ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23038   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
23039   SDLoc dl(Op);
23040 
23041   if (isFP) {
23042 #ifndef NDEBUG
23043     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
23044     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
23045 #endif
23046 
23047     bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23048     SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23049 
23050     // If we have a strict compare with a vXi1 result and the input is 128/256
23051     // bits we can't use a masked compare unless we have VLX. If we use a wider
    // compare like we do for non-strict, we might trigger spurious exceptions
    // from the upper elements. Instead, emit an AVX compare and convert it to
    // a mask.
23054     unsigned Opc;
23055     if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23056         (!IsStrict || Subtarget.hasVLX() ||
23057          Op0.getSimpleValueType().is512BitVector())) {
23058       assert(VT.getVectorNumElements() <= 16);
23059       Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23060     } else {
23061       Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23062       // The SSE/AVX packed FP comparison nodes are defined with a
23063       // floating-point vector result that matches the operand type. This allows
23064       // them to work with an SSE1 target (integer vector types are not legal).
23065       VT = Op0.getSimpleValueType();
23066     }
23067 
23068     SDValue Cmp;
23069     bool IsAlwaysSignaling;
23070     unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23071     if (!Subtarget.hasAVX()) {
      // TODO: We could use the following steps to handle a quiet compare with
      // signaling encodings:
      // 1. Get ordered masks from a quiet ISD::SETO.
      // 2. Use the masks to mask potential unordered elements in operand A, B.
      // 3. Get the compare results of the masked A, B.
      // 4. Calculate the final result using the mask and the result from 3.
      // But currently, we just fall back to scalar operations.
23079       if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23080         return SDValue();
23081 
23082       // Insert an extra signaling instruction to raise exception.
23083       if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23084         SDValue SignalCmp = DAG.getNode(
23085             Opc, dl, {VT, MVT::Other},
23086             {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
        // FIXME: It seems we need to update the flags of all new strict nodes.
        // Otherwise, mayRaiseFPException in MI will return false because
        // NoFPExcept is false by default. However, this doesn't appear to be
        // done in other patches.
23091         SignalCmp->setFlags(Op->getFlags());
23092         Chain = SignalCmp.getValue(1);
23093       }
23094 
23095       // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23096       // emit two comparisons and a logic op to tie them together.
23097       if (SSECC >= 8) {
23098         // LLVM predicate is SETUEQ or SETONE.
23099         unsigned CC0, CC1;
23100         unsigned CombineOpc;
23101         if (Cond == ISD::SETUEQ) {
23102           CC0 = 3; // UNORD
23103           CC1 = 0; // EQ
23104           CombineOpc = X86ISD::FOR;
23105         } else {
23106           assert(Cond == ISD::SETONE);
23107           CC0 = 7; // ORD
23108           CC1 = 4; // NEQ
23109           CombineOpc = X86ISD::FAND;
23110         }
23111 
23112         SDValue Cmp0, Cmp1;
23113         if (IsStrict) {
23114           Cmp0 = DAG.getNode(
23115               Opc, dl, {VT, MVT::Other},
23116               {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23117           Cmp1 = DAG.getNode(
23118               Opc, dl, {VT, MVT::Other},
23119               {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23120           Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23121                               Cmp1.getValue(1));
23122         } else {
23123           Cmp0 = DAG.getNode(
23124               Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23125           Cmp1 = DAG.getNode(
23126               Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23127         }
23128         Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23129       } else {
23130         if (IsStrict) {
23131           Cmp = DAG.getNode(
23132               Opc, dl, {VT, MVT::Other},
23133               {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23134           Chain = Cmp.getValue(1);
23135         } else
23136           Cmp = DAG.getNode(
23137               Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23138       }
23139     } else {
23140       // Handle all other FP comparisons here.
23141       if (IsStrict) {
        // Bit 4 of the AVX CC toggles quiet/signaling behavior, so set it when
        // the predicate's default doesn't match the requested semantics.
23143         SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23144         Cmp = DAG.getNode(
23145             Opc, dl, {VT, MVT::Other},
23146             {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23147         Chain = Cmp.getValue(1);
23148       } else
23149         Cmp = DAG.getNode(
23150             Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23151     }
23152 
23153     if (VT.getFixedSizeInBits() >
23154         Op.getSimpleValueType().getFixedSizeInBits()) {
23155       // We emitted a compare with an XMM/YMM result. Finish converting to a
23156       // mask register using a vptestm.
23157       EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23158       Cmp = DAG.getBitcast(CastVT, Cmp);
23159       Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23160                          DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23161     } else {
23162       // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23163       // the result type of SETCC. The bitcast is expected to be optimized
23164       // away during combining/isel.
23165       Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23166     }
23167 
23168     if (IsStrict)
23169       return DAG.getMergeValues({Cmp, Chain}, dl);
23170 
23171     return Cmp;
23172   }
23173 
23174   assert(!IsStrict && "Strict SETCC only handles FP operands.");
23175 
23176   MVT VTOp0 = Op0.getSimpleValueType();
23177   (void)VTOp0;
23178   assert(VTOp0 == Op1.getSimpleValueType() &&
23179          "Expected operands with same type!");
23180   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
23181          "Invalid number of packed elements for source and destination!");
23182 
23183   // The non-AVX512 code below works under the assumption that source and
23184   // destination types are the same.
23185   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23186          "Value types for source and destination must be the same!");
23187 
23188   // The result is boolean, but operands are int/float
23189   if (VT.getVectorElementType() == MVT::i1) {
    // In the AVX-512 architecture setcc returns a mask with i1 elements, but
    // there is no compare instruction for i8 and i16 elements in KNL.
23192     assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23193            "Unexpected operand type");
23194     return LowerIntVSETCC_AVX512(Op, DAG);
23195   }
23196 
23197   // Lower using XOP integer comparisons.
23198   if (VT.is128BitVector() && Subtarget.hasXOP()) {
23199     // Translate compare code to XOP PCOM compare mode.
23200     unsigned CmpMode = 0;
23201     switch (Cond) {
23202     default: llvm_unreachable("Unexpected SETCC condition");
23203     case ISD::SETULT:
23204     case ISD::SETLT: CmpMode = 0x00; break;
23205     case ISD::SETULE:
23206     case ISD::SETLE: CmpMode = 0x01; break;
23207     case ISD::SETUGT:
23208     case ISD::SETGT: CmpMode = 0x02; break;
23209     case ISD::SETUGE:
23210     case ISD::SETGE: CmpMode = 0x03; break;
23211     case ISD::SETEQ: CmpMode = 0x04; break;
23212     case ISD::SETNE: CmpMode = 0x05; break;
23213     }
23214 
23215     // Are we comparing unsigned or signed integers?
23216     unsigned Opc =
23217         ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23218 
23219     return DAG.getNode(Opc, dl, VT, Op0, Op1,
23220                        DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23221   }
23222 
23223   // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23224   // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23225   if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23226     SDValue BC0 = peekThroughBitcasts(Op0);
23227     if (BC0.getOpcode() == ISD::AND) {
23228       APInt UndefElts;
23229       SmallVector<APInt, 64> EltBits;
23230       if (getTargetConstantBitsFromNode(BC0.getOperand(1),
23231                                         VT.getScalarSizeInBits(), UndefElts,
23232                                         EltBits, false, false)) {
23233         if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23234           Cond = ISD::SETEQ;
23235           Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23236         }
23237       }
23238     }
23239   }
23240 
23241   // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
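  // e.g. for v4i32 and C == 8 (bit 3), (X & 8) == 8 becomes
  // ashr(shl(X, 28), 31), broadcasting bit 3 of each element into a 0/-1 mask.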
23242   if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23243       Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23244     ConstantSDNode *C1 = isConstOrConstSplat(Op1);
23245     if (C1 && C1->getAPIntValue().isPowerOf2()) {
23246       unsigned BitWidth = VT.getScalarSizeInBits();
23247       unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23248 
23249       SDValue Result = Op0.getOperand(0);
23250       Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23251                            DAG.getConstant(ShiftAmt, dl, VT));
23252       Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23253                            DAG.getConstant(BitWidth - 1, dl, VT));
23254       return Result;
23255     }
23256   }
23257 
23258   // Break 256-bit integer vector compare into smaller ones.
23259   if (VT.is256BitVector() && !Subtarget.hasInt256())
23260     return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23261 
23262   if (VT == MVT::v32i16 || VT == MVT::v64i8) {
23263     assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
23264     return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23265   }
23266 
23267   // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23268   // not-of-PCMPEQ:
23269   // X != INT_MIN --> X >s INT_MIN
23270   // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23271   // +X != 0 --> +X >s 0
23272   APInt ConstValue;
23273   if (Cond == ISD::SETNE &&
23274       ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23275     if (ConstValue.isMinSignedValue())
23276       Cond = ISD::SETGT;
23277     else if (ConstValue.isMaxSignedValue())
23278       Cond = ISD::SETLT;
23279     else if (ConstValue.isNullValue() && DAG.SignBitIsZero(Op0))
23280       Cond = ISD::SETGT;
23281   }
23282 
23283   // If both operands are known non-negative, then an unsigned compare is the
23284   // same as a signed compare and there's no need to flip signbits.
23285   // TODO: We could check for more general simplifications here since we're
23286   // computing known bits.
23287   bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23288                    !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23289 
23290   // Special case: Use min/max operations for unsigned compares.
23291   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23292   if (ISD::isUnsignedIntSetCC(Cond) &&
23293       (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23294       TLI.isOperationLegal(ISD::UMIN, VT)) {
23295     // If we have a constant operand, increment/decrement it and change the
23296     // condition to avoid an invert.
23297     if (Cond == ISD::SETUGT) {
23298       // X > C --> X >= (C+1) --> X == umax(X, C+1)
23299       if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
23300         Op1 = UGTOp1;
23301         Cond = ISD::SETUGE;
23302       }
23303     }
23304     if (Cond == ISD::SETULT) {
23305       // X < C --> X <= (C-1) --> X == umin(X, C-1)
23306       if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
23307         Op1 = ULTOp1;
23308         Cond = ISD::SETULE;
23309       }
23310     }
23311     bool Invert = false;
23312     unsigned Opc;
23313     switch (Cond) {
23314     default: llvm_unreachable("Unexpected condition code");
23315     case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
23316     case ISD::SETULE: Opc = ISD::UMIN; break;
23317     case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
23318     case ISD::SETUGE: Opc = ISD::UMAX; break;
23319     }
23320 
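    // x <=u y iff umin(x, y) == x, and x >=u y iff umax(x, y) == x; the GT/LT
    // cases are handled as the logical-not of these.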
23321     SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23322     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23323 
23324     // If the logical-not of the result is required, perform that now.
23325     if (Invert)
23326       Result = DAG.getNOT(dl, Result, VT);
23327 
23328     return Result;
23329   }
23330 
23331   // Try to use SUBUS and PCMPEQ.
23332   if (FlipSigns)
23333     if (SDValue V =
23334             LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23335       return V;
23336 
  // We are handling one of the integer comparisons here. Since SSE only has
  // GT and EQ comparisons for integers, swapping the operands and/or using
  // multiple operations may be required to lower some comparisons.
23340   unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23341                                                             : X86ISD::PCMPGT;
23342   bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23343               Cond == ISD::SETGE || Cond == ISD::SETUGE;
23344   bool Invert = Cond == ISD::SETNE ||
23345                 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23346 
23347   if (Swap)
23348     std::swap(Op0, Op1);
23349 
23350   // Check that the operation in question is available (most are plain SSE2,
23351   // but PCMPGTQ and PCMPEQQ have different requirements).
23352   if (VT == MVT::v2i64) {
23353     if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23354       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23355 
23356       // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23357       // the odd elements over the even elements.
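      // (0 >s x) on a 64-bit element is just the sign test of x, and the sign
      // bit lives in the odd (high) dword, so comparing dwords and then
      // broadcasting the odd results yields the full 64-bit mask.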
23358       if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23359         Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23360         Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23361 
23362         SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23363         static const int MaskHi[] = { 1, 1, 3, 3 };
23364         SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23365 
23366         return DAG.getBitcast(VT, Result);
23367       }
23368 
23369       if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23370         Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23371         Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23372 
23373         SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23374         static const int MaskHi[] = { 1, 1, 3, 3 };
23375         SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23376 
23377         return DAG.getBitcast(VT, Result);
23378       }
23379 
23380       // Since SSE has no unsigned integer comparisons, we need to flip the sign
23381       // bits of the inputs before performing those operations. The lower
23382       // compare is always unsigned.
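      // Flipping bit 31 (the sign bit of the low dword) turns the unsigned
      // low-dword compare into a signed one that PCMPGTD can do; bit 63 is
      // additionally flipped when the original 64-bit compare is itself
      // unsigned (FlipSigns).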
23383       SDValue SB;
23384       if (FlipSigns) {
23385         SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
23386       } else {
23387         SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
23388       }
23389       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
23390       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
23391 
23392       // Cast everything to the right type.
23393       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23394       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23395 
23396       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
23397       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23398       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
23399 
      // Create masks for only the low parts/high parts of the 64-bit integers.
23401       static const int MaskHi[] = { 1, 1, 3, 3 };
23402       static const int MaskLo[] = { 0, 0, 2, 2 };
23403       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
23404       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23405       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23406 
23407       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
23408       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
23409 
23410       if (Invert)
23411         Result = DAG.getNOT(dl, Result, MVT::v4i32);
23412 
23413       return DAG.getBitcast(VT, Result);
23414     }
23415 
23416     if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq
      // with pcmpeqd + pshufd + pand.
23419       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
23420 
23421       // First cast everything to the right type.
23422       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23423       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23424 
23425       // Do the compare.
23426       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
23427 
23428       // Make sure the lower and upper halves are both all-ones.
23429       static const int Mask[] = { 1, 0, 3, 2 };
23430       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
23431       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
23432 
23433       if (Invert)
23434         Result = DAG.getNOT(dl, Result, MVT::v4i32);
23435 
23436       return DAG.getBitcast(VT, Result);
23437     }
23438   }
23439 
23440   // Since SSE has no unsigned integer comparisons, we need to flip the sign
23441   // bits of the inputs before performing those operations.
23442   if (FlipSigns) {
23443     MVT EltVT = VT.getVectorElementType();
23444     SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23445                                  VT);
23446     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23447     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23448   }
23449 
23450   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23451 
23452   // If the logical-not of the result is required, perform that now.
23453   if (Invert)
23454     Result = DAG.getNOT(dl, Result, VT);
23455 
23456   return Result;
23457 }
23458 
23459 // Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
23460 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23461                               const SDLoc &dl, SelectionDAG &DAG,
23462                               const X86Subtarget &Subtarget,
23463                               SDValue &X86CC) {
23464   // Only support equality comparisons.
23465   if (CC != ISD::SETEQ && CC != ISD::SETNE)
23466     return SDValue();
23467 
23468   // Must be a bitcast from vXi1.
23469   if (Op0.getOpcode() != ISD::BITCAST)
23470     return SDValue();
23471 
23472   Op0 = Op0.getOperand(0);
23473   MVT VT = Op0.getSimpleValueType();
23474   if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23475       !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23476       !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23477     return SDValue();
23478 
23479   X86::CondCode X86Cond;
23480   if (isNullConstant(Op1)) {
23481     X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23482   } else if (isAllOnesConstant(Op1)) {
23483     // C flag is set for all ones.
23484     X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23485   } else
23486     return SDValue();
23487 
  // If the input is an AND, we can combine its operands into the KTEST.
23489   bool KTestable = false;
23490   if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23491     KTestable = true;
23492   if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23493     KTestable = true;
23494   if (!isNullConstant(Op1))
23495     KTestable = false;
23496   if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
23497     SDValue LHS = Op0.getOperand(0);
23498     SDValue RHS = Op0.getOperand(1);
23499     X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23500     return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
23501   }
23502 
  // If the input is an OR, we can combine its operands into the KORTEST.
23504   SDValue LHS = Op0;
23505   SDValue RHS = Op0;
23506   if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
23507     LHS = Op0.getOperand(0);
23508     RHS = Op0.getOperand(1);
23509   }
23510 
23511   X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23512   return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
23513 }
23514 
23515 /// Emit flags for the given setcc condition and operands. Also returns the
23516 /// corresponding X86 condition code constant in X86CC.
23517 SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
23518                                              ISD::CondCode CC, const SDLoc &dl,
23519                                              SelectionDAG &DAG,
23520                                              SDValue &X86CC) const {
23521   // Optimize to BT if possible.
23522   // Lower (X & (1 << N)) == 0 to BT(X, N).
23523   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
23524   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
23525   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
23526       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23527     if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
23528       return BT;
23529   }
23530 
  // Try to use PTEST/PMOVMSKB for a tree of ORs compared for equality with 0.
  // TODO: We could handle an AND tree compared with all ones as well by using
  // the C flag.
23533   if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
23534     if (SDValue CmpZ =
23535             MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
23536       return CmpZ;
23537 
23538   // Try to lower using KORTEST or KTEST.
23539   if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
23540     return Test;
23541 
23542   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
23543   // these.
23544   if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
23545       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23546     // If the input is a setcc, then reuse the input setcc or use a new one with
23547     // the inverted condition.
23548     if (Op0.getOpcode() == X86ISD::SETCC) {
23549       bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
23550 
23551       X86CC = Op0.getOperand(0);
23552       if (Invert) {
23553         X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
23554         CCode = X86::GetOppositeBranchCondition(CCode);
23555         X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
23556       }
23557 
23558       return Op0.getOperand(1);
23559     }
23560   }
23561 
  // Try to use the carry flag from the add in place of a separate CMP for:
23563   // (seteq (add X, -1), -1). Similar for setne.
23564   if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
23565       Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23566     if (isProfitableToUseFlagOp(Op0)) {
23567       SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
23568 
23569       SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
23570                                 Op0.getOperand(1));
23571       DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
23572       X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23573       X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
23574       return SDValue(New.getNode(), 1);
23575     }
23576   }
23577 
23578   X86::CondCode CondCode =
23579       TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
23580   assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
23581 
23582   SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
23583   X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23584   return EFLAGS;
23585 }
23586 
23587 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
23588 
23589   bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23590                   Op.getOpcode() == ISD::STRICT_FSETCCS;
23591   MVT VT = Op->getSimpleValueType(0);
23592 
23593   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
23594 
23595   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
23596   SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23597   SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23598   SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23599   SDLoc dl(Op);
23600   ISD::CondCode CC =
23601       cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
23602 
23603   // Handle f128 first, since one possible outcome is a normal integer
23604   // comparison which gets handled by emitFlagsForSetcc.
23605   if (Op0.getValueType() == MVT::f128) {
23606     softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
23607                         Op.getOpcode() == ISD::STRICT_FSETCCS);
23608 
23609     // If softenSetCCOperands returned a scalar, use it.
23610     if (!Op1.getNode()) {
23611       assert(Op0.getValueType() == Op.getValueType() &&
23612              "Unexpected setcc expansion!");
23613       if (IsStrict)
23614         return DAG.getMergeValues({Op0, Chain}, dl);
23615       return Op0;
23616     }
23617   }
23618 
23619   if (Op0.getSimpleValueType().isInteger()) {
    // Attempt to canonicalize SGT/UGT compares with a constant into SGE/UGE,
    // which reduces the number of EFLAGS bits read (the GE conditions don't
    // read ZF); this may translate to fewer uops depending on the uarch
    // implementation. The equivalent for SLE/ULE -> SLT/ULT isn't likely to
    // happen as we already canonicalize to that CondCode.
    // NOTE: Only do this if incrementing the constant doesn't increase the bit
    // encoding size - so it must either already be an i8 or i32 immediate, or
    // it shrinks down to that. We don't do this for any i64s to avoid
    // additional constant materializations.
23629     // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
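    // For example, (setugt X, 9) becomes (setuge X, 10); COND_AE reads only
    // CF, whereas COND_A reads both CF and ZF.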
23630     if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
23631       const APInt &Op1Val = Op1C->getAPIntValue();
23632       if (!Op1Val.isNullValue()) {
23633         // Ensure the constant+1 doesn't overflow.
23634         if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
23635             (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
23636           APInt Op1ValPlusOne = Op1Val + 1;
23637           if (Op1ValPlusOne.isSignedIntN(32) &&
23638               (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
23639             Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
23640             CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
23641                                             : ISD::CondCode::SETUGE;
23642           }
23643         }
23644       }
23645     }
23646 
23647     SDValue X86CC;
23648     SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
23649     SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23650     return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23651   }
23652 
23653   // Handle floating point.
23654   X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
23655   if (CondCode == X86::COND_INVALID)
23656     return SDValue();
23657 
23658   SDValue EFLAGS;
23659   if (IsStrict) {
23660     bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23661     EFLAGS =
23662         DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
23663                     dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
23664     Chain = EFLAGS.getValue(1);
23665   } else {
23666     EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
23667   }
23668 
23669   SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23670   SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23671   return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23672 }
23673 
23674 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
23675   SDValue LHS = Op.getOperand(0);
23676   SDValue RHS = Op.getOperand(1);
23677   SDValue Carry = Op.getOperand(2);
23678   SDValue Cond = Op.getOperand(3);
23679   SDLoc DL(Op);
23680 
23681   assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
23682   X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
23683 
23684   // Recreate the carry if needed.
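  // Adding all-ones to the carry value sets CF exactly when the value is
  // nonzero, moving the incoming carry back into EFLAGS for the SBB below.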
23685   EVT CarryVT = Carry.getValueType();
23686   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23687                       Carry, DAG.getAllOnesConstant(DL, CarryVT));
23688 
23689   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
23690   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
23691   return getSETCC(CC, Cmp.getValue(1), DL, DAG);
23692 }
23693 
// This function returns three things: the arithmetic computation itself
// (Value), an EFLAGS result (Overflow), and a condition code (Cond). Value and
// Overflow are returned as a pair, while Cond is passed back through the
// reference parameter. The flag and the condition code define the case in
// which the arithmetic computation overflows.
23698 static std::pair<SDValue, SDValue>
23699 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
23700   assert(Op.getResNo() == 0 && "Unexpected result number!");
23701   SDValue Value, Overflow;
23702   SDValue LHS = Op.getOperand(0);
23703   SDValue RHS = Op.getOperand(1);
23704   unsigned BaseOp = 0;
23705   SDLoc DL(Op);
23706   switch (Op.getOpcode()) {
23707   default: llvm_unreachable("Unknown ovf instruction!");
23708   case ISD::SADDO:
23709     BaseOp = X86ISD::ADD;
23710     Cond = X86::COND_O;
23711     break;
23712   case ISD::UADDO:
23713     BaseOp = X86ISD::ADD;
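    // Adding 1 overflows exactly when the result wraps to zero, so ZF can
    // stand in for CF (e.g. when the add is selected as INC, which does not
    // update CF).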
23714     Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
23715     break;
23716   case ISD::SSUBO:
23717     BaseOp = X86ISD::SUB;
23718     Cond = X86::COND_O;
23719     break;
23720   case ISD::USUBO:
23721     BaseOp = X86ISD::SUB;
23722     Cond = X86::COND_B;
23723     break;
23724   case ISD::SMULO:
23725     BaseOp = X86ISD::SMUL;
23726     Cond = X86::COND_O;
23727     break;
23728   case ISD::UMULO:
23729     BaseOp = X86ISD::UMUL;
23730     Cond = X86::COND_O;
23731     break;
23732   }
23733 
23734   if (BaseOp) {
23735     // Also sets EFLAGS.
23736     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23737     Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23738     Overflow = Value.getValue(1);
23739   }
23740 
23741   return std::make_pair(Value, Overflow);
23742 }
23743 
23744 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Lower the "add/sub/mul with overflow" instruction into a regular
  // instruction plus a "setcc" instruction that checks the overflow flag. The
  // "brcond" lowering looks for this combo and may remove the "setcc"
  // instruction if the "setcc" has only one use.
23749   SDLoc DL(Op);
23750   X86::CondCode Cond;
23751   SDValue Value, Overflow;
23752   std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
23753 
23754   SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
23755   assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
23756   return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
23757 }
23758 
/// Return true if opcode is an X86 logical comparison.
23760 static bool isX86LogicalCmp(SDValue Op) {
23761   unsigned Opc = Op.getOpcode();
23762   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
23763       Opc == X86ISD::FCMP)
23764     return true;
23765   if (Op.getResNo() == 1 &&
23766       (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
23767        Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
23768        Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
23769     return true;
23770 
23771   return false;
23772 }
23773 
23774 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
23775   if (V.getOpcode() != ISD::TRUNCATE)
23776     return false;
23777 
23778   SDValue VOp0 = V.getOperand(0);
23779   unsigned InBits = VOp0.getValueSizeInBits();
23780   unsigned Bits = V.getValueSizeInBits();
23781   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
23782 }
23783 
23784 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
23785   bool AddTest = true;
23786   SDValue Cond  = Op.getOperand(0);
23787   SDValue Op1 = Op.getOperand(1);
23788   SDValue Op2 = Op.getOperand(2);
23789   SDLoc DL(Op);
23790   MVT VT = Op1.getSimpleValueType();
23791   SDValue CC;
23792 
23793   // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
23794   // are available or VBLENDV if AVX is available.
23795   // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
23796   if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
23797       VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
23798     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
23799     bool IsAlwaysSignaling;
23800     unsigned SSECC =
23801         translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
23802                            CondOp0, CondOp1, IsAlwaysSignaling);
23803 
23804     if (Subtarget.hasAVX512()) {
23805       SDValue Cmp =
23806           DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
23807                       DAG.getTargetConstant(SSECC, DL, MVT::i8));
23808       assert(!VT.isVector() && "Not a scalar type?");
23809       return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23810     }
23811 
23812     if (SSECC < 8 || Subtarget.hasAVX()) {
23813       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
23814                                 DAG.getTargetConstant(SSECC, DL, MVT::i8));
23815 
23816       // If we have AVX, we can use a variable vector select (VBLENDV) instead
23817       // of 3 logic instructions for size savings and potentially speed.
23818       // Unfortunately, there is no scalar form of VBLENDV.
23819 
23820       // If either operand is a +0.0 constant, don't try this. We can expect to
23821       // optimize away at least one of the logic instructions later in that
23822       // case, so that sequence would be faster than a variable blend.
23823 
23824       // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
23825       // uses XMM0 as the selection register. That may need just as many
23826       // instructions as the AND/ANDN/OR sequence due to register moves, so
23827       // don't bother.
23828       if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
23829           !isNullFPConstant(Op2)) {
23830         // Convert to vectors, do a VSELECT, and convert back to scalar.
23831         // All of the conversions should be optimized away.
23832         MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
23833         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
23834         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
23835         SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
23836 
23837         MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
23838         VCmp = DAG.getBitcast(VCmpVT, VCmp);
23839 
23840         SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
23841 
23842         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
23843                            VSel, DAG.getIntPtrConstant(0, DL));
23844       }
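      // Emulate the scalar select with bitwise ops: Cmp is an all-ones or
      // all-zeros mask, so (Cmp & Op1) | (~Cmp & Op2) picks Op1 when the
      // condition holds and Op2 otherwise.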
23845       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
23846       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
23847       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
23848     }
23849   }
23850 
23851   // AVX512 fallback is to lower selects of scalar floats to masked moves.
23852   if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
23853     SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
23854     return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23855   }
23856 
23857   if (Cond.getOpcode() == ISD::SETCC) {
23858     if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
23859       Cond = NewCond;
23860       // If the condition was updated, it's possible that the operands of the
23861       // select were also updated (for example, EmitTest has a RAUW). Refresh
23862       // the local references to the select operands in case they got stale.
23863       Op1 = Op.getOperand(1);
23864       Op2 = Op.getOperand(2);
23865     }
23866   }
23867 
23868   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
23869   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
23870   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
23871   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
23872   // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
23873   // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
23874   if (Cond.getOpcode() == X86ISD::SETCC &&
23875       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
23876       isNullConstant(Cond.getOperand(1).getOperand(1))) {
23877     SDValue Cmp = Cond.getOperand(1);
23878     SDValue CmpOp0 = Cmp.getOperand(0);
23879     unsigned CondCode = Cond.getConstantOperandVal(0);
23880 
    // Special handling for the __builtin_ffs(X) - 1 pattern, which looks like
    // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
    // handling below so we keep the CMP with 0. The CMP should later be
    // removed by optimizeCompareInst, using the flags from the BSR/TZCNT
    // emitted for the cttz_zero_undef.
23886     auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
23887       return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
23888               Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
23889     };
23890     if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
23891         ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
23892          (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
23893       // Keep Cmp.
23894     } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
23895         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
23896       SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
23897 
23898       SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23899       SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
23900 
23901       // Apply further optimizations for special cases
23902       // (select (x != 0), -1, 0) -> neg & sbb
23903       // (select (x == 0), 0, -1) -> neg & sbb
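      // NEG sets CF exactly when x is nonzero, and SBB(0, 0) then computes
      // 0 - 0 - CF, i.e. all-ones when x != 0 and zero when x == 0.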
23904       if (isNullConstant(Y) &&
23905           (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
23906         SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
23907         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
23908         Zero = DAG.getConstant(0, DL, Op.getValueType());
23909         return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
23910       }
23911 
23912       Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
23913                         CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
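      // CmpOp0 - 1 borrows (sets CF) exactly when CmpOp0 == 0, so the SBB
      // below materializes the 0/-1 mask "sign_bit(x - 1)" from the patterns
      // listed above.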
23914 
23915       SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
23916       SDValue Res =   // Res = 0 or -1.
23917         DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
23918 
23919       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
23920         Res = DAG.getNOT(DL, Res, Res.getValueType());
23921 
23922       return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
23923     } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
23924                Cmp.getOperand(0).getOpcode() == ISD::AND &&
23925                isOneConstant(Cmp.getOperand(0).getOperand(1))) {
23926       SDValue Src1, Src2;
      // Returns true if Op2 is an XOR or OR of Op1 with something else, i.e.
      // the select operands look like (a, a op b) or (b, a op b).
23930       auto isOrXorPattern = [&]() {
23931         if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
23932             (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
23933           Src1 =
23934               Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
23935           Src2 = Op1;
23936           return true;
23937         }
23938         return false;
23939       };
23940 
23941       if (isOrXorPattern()) {
23942         SDValue Neg;
23943         unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
        // We need a mask of all zeros or all ones with the same size as the
        // other operands.
23946         if (CmpSz > VT.getSizeInBits())
23947           Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
23948         else if (CmpSz < VT.getSizeInBits())
23949           Neg = DAG.getNode(ISD::AND, DL, VT,
23950               DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
23951               DAG.getConstant(1, DL, VT));
23952         else
23953           Neg = CmpOp0;
23954         SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
23955                                    Neg); // -(and (x, 0x1))
23956         SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
23957         return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
23958       }
23959     }
23960   }
23961 
23962   // Look past (and (setcc_carry (cmp ...)), 1).
23963   if (Cond.getOpcode() == ISD::AND &&
23964       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
23965       isOneConstant(Cond.getOperand(1)))
23966     Cond = Cond.getOperand(0);
23967 
  // If the condition flag is set by an X86ISD::CMP, then use it as the
  // condition-setting operand in place of the X86ISD::SETCC.
23970   unsigned CondOpcode = Cond.getOpcode();
23971   if (CondOpcode == X86ISD::SETCC ||
23972       CondOpcode == X86ISD::SETCC_CARRY) {
23973     CC = Cond.getOperand(0);
23974 
23975     SDValue Cmp = Cond.getOperand(1);
23976     bool IllegalFPCMov = false;
23977     if (VT.isFloatingPoint() && !VT.isVector() &&
23978         !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov())  // FPStack?
23979       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
23980 
23981     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
23982         Cmp.getOpcode() == X86ISD::BT) { // FIXME
23983       Cond = Cmp;
23984       AddTest = false;
23985     }
23986   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
23987              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
23988              CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
23989     SDValue Value;
23990     X86::CondCode X86Cond;
23991     std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
23992 
23993     CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
23994     AddTest = false;
23995   }
23996 
23997   if (AddTest) {
23998     // Look past the truncate if the high bits are known zero.
23999     if (isTruncWithZeroHighBitsInput(Cond, DAG))
24000       Cond = Cond.getOperand(0);
24001 
24002     // We know the result of AND is compared against zero. Try to match
24003     // it to BT.
24004     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24005       SDValue BTCC;
24006       if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
24007         CC = BTCC;
24008         Cond = BT;
24009         AddTest = false;
24010       }
24011     }
24012   }
24013 
24014   if (AddTest) {
24015     CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24016     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24017   }
24018 
24019   // a <  b ? -1 :  0 -> RES = ~setcc_carry
24020   // a <  b ?  0 : -1 -> RES = setcc_carry
24021   // a >= b ? -1 :  0 -> RES = setcc_carry
24022   // a >= b ?  0 : -1 -> RES = ~setcc_carry
24023   if (Cond.getOpcode() == X86ISD::SUB) {
24024     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
24025 
24026     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24027         (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24028         (isNullConstant(Op1) || isNullConstant(Op2))) {
24029       SDValue Res =
24030           DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24031                       DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24032       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24033         return DAG.getNOT(DL, Res, Res.getValueType());
24034       return Res;
24035     }
24036   }
24037 
  // X86 doesn't have an i8 cmov. If both operands are the result of a
  // truncate, widen the cmov and push the truncate through. This avoids
  // introducing a new branch during isel and doesn't add any extensions.
24041   if (Op.getValueType() == MVT::i8 &&
24042       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24043     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24044     if (T1.getValueType() == T2.getValueType() &&
24045         // Exclude CopyFromReg to avoid partial register stalls.
24046         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24047       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24048                                  CC, Cond);
24049       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24050     }
24051   }
24052 
24053   // Or finally, promote i8 cmovs if we have CMOV,
24054   //                 or i16 cmovs if it won't prevent folding a load.
24055   // FIXME: we should not limit promotion of i8 case to only when the CMOV is
  //        legal, but EmitLoweredSelect() cannot deal with these extensions
  //        being inserted between two CMOVs. (in i16 case too TBN)
24058   //        https://bugs.llvm.org/show_bug.cgi?id=40974
24059   if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
24060       (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
24061        !MayFoldLoad(Op2))) {
24062     Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24063     Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24064     SDValue Ops[] = { Op2, Op1, CC, Cond };
24065     SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24066     return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24067   }
24068 
  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if the
  // condition is true.
24071   SDValue Ops[] = { Op2, Op1, CC, Cond };
24072   return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
24073 }
24074 
24075 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
24076                                      const X86Subtarget &Subtarget,
24077                                      SelectionDAG &DAG) {
24078   MVT VT = Op->getSimpleValueType(0);
24079   SDValue In = Op->getOperand(0);
24080   MVT InVT = In.getSimpleValueType();
24081   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24082   MVT VTElt = VT.getVectorElementType();
24083   SDLoc dl(Op);
24084 
24085   unsigned NumElts = VT.getVectorNumElements();
24086 
24087   // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24088   MVT ExtVT = VT;
24089   if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24090     // If v16i32 is to be avoided, we'll need to split and concatenate.
24091     if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24092       return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24093 
24094     ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24095   }
24096 
24097   // Widen to 512-bits if VLX is not supported.
24098   MVT WideVT = ExtVT;
24099   if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24100     NumElts *= 512 / ExtVT.getSizeInBits();
24101     InVT = MVT::getVectorVT(MVT::i1, NumElts);
24102     In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24103                      In, DAG.getIntPtrConstant(0, dl));
24104     WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24105   }
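  // For example, without VLX a v4i1 -> v4i32 sign extend is widened to
  // v16i1 -> v16i32 here and the low 4 elements are extracted again below.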
24106 
24107   SDValue V;
24108   MVT WideEltVT = WideVT.getVectorElementType();
24109   if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24110       (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24111     V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24112   } else {
24113     SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24114     SDValue Zero = DAG.getConstant(0, dl, WideVT);
24115     V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24116   }
24117 
24118   // Truncate if we had to extend i16/i8 above.
24119   if (VT != ExtVT) {
24120     WideVT = MVT::getVectorVT(VTElt, NumElts);
24121     V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24122   }
24123 
24124   // Extract back to 128/256-bit if we widened.
24125   if (WideVT != VT)
24126     V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24127                     DAG.getIntPtrConstant(0, dl));
24128 
24129   return V;
24130 }
24131 
24132 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24133                                SelectionDAG &DAG) {
24134   SDValue In = Op->getOperand(0);
24135   MVT InVT = In.getSimpleValueType();
24136 
24137   if (InVT.getVectorElementType() == MVT::i1)
24138     return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24139 
24140   assert(Subtarget.hasAVX() && "Expected AVX support");
24141   return LowerAVXExtend(Op, DAG, Subtarget);
24142 }
24143 
24144 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24145 // For sign extend this needs to handle all vector sizes and SSE4.1 and
24146 // non-SSE4.1 targets. For zero extend this should only handle inputs of
24147 // MVT::v64i8 when BWI is not supported, but AVX512 is.
24148 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
24149                                         const X86Subtarget &Subtarget,
24150                                         SelectionDAG &DAG) {
24151   SDValue In = Op->getOperand(0);
24152   MVT VT = Op->getSimpleValueType(0);
24153   MVT InVT = In.getSimpleValueType();
24154 
24155   MVT SVT = VT.getVectorElementType();
24156   MVT InSVT = InVT.getVectorElementType();
24157   assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
24158 
24159   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24160     return SDValue();
24161   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24162     return SDValue();
24163   if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24164       !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24165       !(VT.is512BitVector() && Subtarget.hasAVX512()))
24166     return SDValue();
24167 
24168   SDLoc dl(Op);
24169   unsigned Opc = Op.getOpcode();
24170   unsigned NumElts = VT.getVectorNumElements();
24171 
24172   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24173   // For 512-bit vectors, we need 128-bits or 256-bits.
24174   if (InVT.getSizeInBits() > 128) {
24175     // Input needs to be at least the same number of elements as output, and
24176     // at least 128-bits.
24177     int InSize = InSVT.getSizeInBits() * NumElts;
24178     In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24179     InVT = In.getSimpleValueType();
24180   }
24181 
  // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
  // results, so those cases are legal and shouldn't occur here. AVX2/AVX512
  // pmovsx* instructions still need to be handled here for 256/512-bit results.
24185   if (Subtarget.hasInt256()) {
24186     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24187 
24188     if (InVT.getVectorNumElements() != NumElts)
24189       return DAG.getNode(Op.getOpcode(), dl, VT, In);
24190 
24191     // FIXME: Apparently we create inreg operations that could be regular
24192     // extends.
24193     unsigned ExtOpc =
24194         Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24195                                              : ISD::ZERO_EXTEND;
24196     return DAG.getNode(ExtOpc, dl, VT, In);
24197   }
24198 
24199   // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24200   if (Subtarget.hasAVX()) {
24201     assert(VT.is256BitVector() && "256-bit vector expected");
24202     MVT HalfVT = VT.getHalfNumVectorElementsVT();
24203     int HalfNumElts = HalfVT.getVectorNumElements();
24204 
24205     unsigned NumSrcElts = InVT.getVectorNumElements();
24206     SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24207     for (int i = 0; i != HalfNumElts; ++i)
24208       HiMask[i] = HalfNumElts + i;
24209 
24210     SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24211     SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24212     Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24213     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24214   }
24215 
24216   // We should only get here for sign extend.
24217   assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24218   assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24219 
24220   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24221   SDValue Curr = In;
24222   SDValue SignExt = Curr;
24223 
24224   // As SRAI is only available on i16/i32 types, we expand only up to i32
24225   // and handle i64 separately.
24226   if (InVT != MVT::v4i32) {
24227     MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24228 
24229     unsigned DestWidth = DestVT.getScalarSizeInBits();
24230     unsigned Scale = DestWidth / InSVT.getSizeInBits();
24231 
24232     unsigned InNumElts = InVT.getVectorNumElements();
24233     unsigned DestElts = DestVT.getVectorNumElements();
24234 
24235     // Build a shuffle mask that takes each input element and places it in the
24236     // MSBs of the new element size.
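    // e.g. for v16i8 -> v8i16 (Scale == 2) the mask is {-1,0,-1,1,...,-1,7},
    // placing byte i in the high byte of i16 element i so that the arithmetic
    // shift below completes the sign extension.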
24237     SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24238     for (unsigned i = 0; i != DestElts; ++i)
24239       Mask[i * Scale + (Scale - 1)] = i;
24240 
24241     Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24242     Curr = DAG.getBitcast(DestVT, Curr);
24243 
24244     unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24245     SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24246                           DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24247   }
24248 
24249   if (VT == MVT::v2i64) {
24250     assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
24251     SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24252     SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24253     SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24254     SignExt = DAG.getBitcast(VT, SignExt);
24255   }
24256 
24257   return SignExt;
24258 }
24259 
24260 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24261                                 SelectionDAG &DAG) {
24262   MVT VT = Op->getSimpleValueType(0);
24263   SDValue In = Op->getOperand(0);
24264   MVT InVT = In.getSimpleValueType();
24265   SDLoc dl(Op);
24266 
24267   if (InVT.getVectorElementType() == MVT::i1)
24268     return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24269 
24270   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
24271   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
24272          "Expected same number of elements");
24273   assert((VT.getVectorElementType() == MVT::i16 ||
24274           VT.getVectorElementType() == MVT::i32 ||
24275           VT.getVectorElementType() == MVT::i64) &&
24276          "Unexpected element type");
24277   assert((InVT.getVectorElementType() == MVT::i8 ||
24278           InVT.getVectorElementType() == MVT::i16 ||
24279           InVT.getVectorElementType() == MVT::i32) &&
24280          "Unexpected element type");
24281 
24282   if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
24283     assert(InVT == MVT::v32i8 && "Unexpected VT!");
24284     return splitVectorIntUnary(Op, DAG);
24285   }
24286 
24287   if (Subtarget.hasInt256())
24288     return Op;
24289 
  // Optimize vectors in AVX mode by sign extending v8i16 to v8i32 and v4i32
  // to v4i64 as follows:
  //  - Divide the input vector into two halves; for v4i32 the high half is
  //    obtained with the shuffle mask {2, 3, -1, -1}.
  //  - Use the vpmovsx instruction to extend each half
  //    (v4i32 -> v2i64, v8i16 -> v4i32).
  //  - Concatenate the results back to the original VT.
24298   MVT HalfVT = VT.getHalfNumVectorElementsVT();
24299   SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
24300 
24301   unsigned NumElems = InVT.getVectorNumElements();
24302   SmallVector<int,8> ShufMask(NumElems, -1);
24303   for (unsigned i = 0; i != NumElems/2; ++i)
24304     ShufMask[i] = i + NumElems/2;
24305 
24306   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
24307   OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
24308 
24309   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
24310 }
24311 
24312 /// Change a vector store into a pair of half-size vector stores.
24313 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24314   SDValue StoredVal = Store->getValue();
24315   assert((StoredVal.getValueType().is256BitVector() ||
24316           StoredVal.getValueType().is512BitVector()) &&
24317          "Expecting 256/512-bit op");
24318 
24319   // Splitting volatile memory ops is not allowed unless the operation was not
24320   // legal to begin with. Assume the input store is legal (this transform is
24321   // only used for targets with AVX). Note: It is possible that we have an
24322   // illegal type like v2i128, and so we could allow splitting a volatile store
24323   // in that case if that is important.
24324   if (!Store->isSimple())
24325     return SDValue();
24326 
24327   SDLoc DL(Store);
24328   SDValue Value0, Value1;
24329   std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24330   unsigned HalfOffset = Value0.getValueType().getStoreSize();
24331   SDValue Ptr0 = Store->getBasePtr();
24332   SDValue Ptr1 =
24333       DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
24334   SDValue Ch0 =
24335       DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24336                    Store->getOriginalAlign(),
24337                    Store->getMemOperand()->getFlags());
24338   SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24339                              Store->getPointerInfo().getWithOffset(HalfOffset),
24340                              Store->getOriginalAlign(),
24341                              Store->getMemOperand()->getFlags());
24342   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24343 }
24344 
24345 /// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
24346 /// type.
24347 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
24348                                     SelectionDAG &DAG) {
24349   SDValue StoredVal = Store->getValue();
24350   assert(StoreVT.is128BitVector() &&
24351          StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24352   StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24353 
24354   // Splitting volatile memory ops is not allowed unless the operation was not
24355   // legal to begin with. We are assuming the input op is legal (this transform
24356   // is only used for targets with AVX).
24357   if (!Store->isSimple())
24358     return SDValue();
24359 
24360   MVT StoreSVT = StoreVT.getScalarType();
24361   unsigned NumElems = StoreVT.getVectorNumElements();
24362   unsigned ScalarSize = StoreSVT.getStoreSize();
24363 
24364   SDLoc DL(Store);
24365   SmallVector<SDValue, 4> Stores;
24366   for (unsigned i = 0; i != NumElems; ++i) {
24367     unsigned Offset = i * ScalarSize;
24368     SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24369                                            TypeSize::Fixed(Offset), DL);
24370     SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24371                               DAG.getIntPtrConstant(i, DL));
24372     SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24373                               Store->getPointerInfo().getWithOffset(Offset),
24374                               Store->getOriginalAlign(),
24375                               Store->getMemOperand()->getFlags());
24376     Stores.push_back(Ch);
24377   }
24378   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24379 }
24380 
24381 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
24382                           SelectionDAG &DAG) {
24383   StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
24384   SDLoc dl(St);
24385   SDValue StoredVal = St->getValue();
24386 
24387   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
24388   if (StoredVal.getValueType().isVector() &&
24389       StoredVal.getValueType().getVectorElementType() == MVT::i1) {
24390     unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
24391     assert(NumElts <= 8 && "Unexpected VT");
24392     assert(!St->isTruncatingStore() && "Expected non-truncating store");
24393     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24394            "Expected AVX512F without AVX512DQI");
24395 
24396     // We must pad with zeros to ensure we store zeroes to any unused bits.
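    // For example, a v4i1 value is widened to v16i1, bitcast to i16 and
    // truncated to i8; the zero-extend-in-reg below then clears bits 4-7
    // before the byte is stored.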
24397     StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24398                             DAG.getUNDEF(MVT::v16i1), StoredVal,
24399                             DAG.getIntPtrConstant(0, dl));
24400     StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
24401     StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
24402     // Make sure we store zeros in the extra bits.
24403     if (NumElts < 8)
24404       StoredVal = DAG.getZeroExtendInReg(
24405           StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
24406 
24407     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24408                         St->getPointerInfo(), St->getOriginalAlign(),
24409                         St->getMemOperand()->getFlags());
24410   }
24411 
24412   if (St->isTruncatingStore())
24413     return SDValue();
24414 
24415   // If this is a 256-bit store of concatenated ops, we are better off splitting
24416   // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
24417   // and each half can execute independently. Some cores would split the op into
24418   // halves anyway, so the concat (vinsertf128) is purely an extra op.
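  // For example, a store of (v8f32 (concat_vectors X, Y)) becomes a v4f32
  // store of X at offset 0 and a v4f32 store of Y at offset 16.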
24419   MVT StoreVT = StoredVal.getSimpleValueType();
24420   if (StoreVT.is256BitVector() ||
24421       ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
24422        !Subtarget.hasBWI())) {
24423     SmallVector<SDValue, 4> CatOps;
24424     if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
24425       return splitVectorStore(St, DAG);
24426     return SDValue();
24427   }
24428 
24429   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24430   assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
24431          "Unexpected VT");
24432   assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
24433              TargetLowering::TypeWidenVector && "Unexpected type action!");
24434 
24435   EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
24436   StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
24437                           DAG.getUNDEF(StoreVT));
24438 
24439   if (Subtarget.hasSSE2()) {
24440     // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24441     // and store it.
24442     MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
24443     MVT CastVT = MVT::getVectorVT(StVT, 2);
24444     StoredVal = DAG.getBitcast(CastVT, StoredVal);
24445     StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
24446                             DAG.getIntPtrConstant(0, dl));
24447 
24448     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24449                         St->getPointerInfo(), St->getOriginalAlign(),
24450                         St->getMemOperand()->getFlags());
24451   }
24452   assert(Subtarget.hasSSE1() && "Expected SSE");
24453   SDVTList Tys = DAG.getVTList(MVT::Other);
24454   SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24455   return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
24456                                  St->getMemOperand());
24457 }
24458 
24459 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
24460 // may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate sext/sext_invec for SEXTLOADs when available; otherwise
// we'll emit a shuffle and an arithmetic shift.
24463 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
24464 // TODO: It is possible to support ZExt by zeroing the undef values during
24465 // the shuffle phase or after the shuffle.
24466 static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
24468   MVT RegVT = Op.getSimpleValueType();
24469   assert(RegVT.isVector() && "We only custom lower vector loads.");
24470   assert(RegVT.isInteger() &&
24471          "We only custom lower integer vector loads.");
24472 
24473   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24474   SDLoc dl(Ld);
24475 
24476   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
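  // For example, a v8i1 load becomes an i8 scalar load, an any-extend to i16,
  // a bitcast to v16i1 and an extract of the low v8i1 subvector.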
24477   if (RegVT.getVectorElementType() == MVT::i1) {
24478     assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24479     assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24480     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
           "Expected AVX512F without AVX512DQ");
24482 
24483     SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24484                                 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24485                                 Ld->getMemOperand()->getFlags());
24486 
24487     // Replace chain users with the new chain.
24488     assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24489 
24490     SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24491     Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24492                       DAG.getBitcast(MVT::v16i1, Val),
24493                       DAG.getIntPtrConstant(0, dl));
24494     return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24495   }
24496 
24497   return SDValue();
24498 }
24499 
/// Return true if the node is an ISD::AND or ISD::OR of two X86ISD::SETCC
/// nodes, each of which has no other use apart from the AND / OR.
24502 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24503   Opc = Op.getOpcode();
24504   if (Opc != ISD::OR && Opc != ISD::AND)
24505     return false;
24506   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24507           Op.getOperand(0).hasOneUse() &&
24508           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24509           Op.getOperand(1).hasOneUse());
24510 }
24511 
24512 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24513   SDValue Chain = Op.getOperand(0);
24514   SDValue Cond  = Op.getOperand(1);
24515   SDValue Dest  = Op.getOperand(2);
24516   SDLoc dl(Op);
24517 
24518   if (Cond.getOpcode() == ISD::SETCC &&
24519       Cond.getOperand(0).getValueType() != MVT::f128) {
24520     SDValue LHS = Cond.getOperand(0);
24521     SDValue RHS = Cond.getOperand(1);
24522     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24523 
24524     // Special case for
24525     // setcc([su]{add,sub,mul}o == 0)
24526     // setcc([su]{add,sub,mul}o != 1)
24527     if (ISD::isOverflowIntrOpRes(LHS) &&
24528         (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24529         (isNullConstant(RHS) || isOneConstant(RHS))) {
24530       SDValue Value, Overflow;
24531       X86::CondCode X86Cond;
24532       std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24533 
24534       if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24535         X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24536 
24537       SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24538       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24539                          Overflow);
24540     }
24541 
24542     if (LHS.getSimpleValueType().isInteger()) {
24543       SDValue CCVal;
24544       SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
24545       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24546                          EFLAGS);
24547     }
24548 
24549     if (CC == ISD::SETOEQ) {
24550       // For FCMP_OEQ, we can emit
24551       // two branches instead of an explicit AND instruction with a
24552       // separate test. However, we only do this if this block doesn't
24553       // have a fall-through edge, because this requires an explicit
24554       // jmp when the condition is false.
24555       if (Op.getNode()->hasOneUse()) {
24556         SDNode *User = *Op.getNode()->use_begin();
24557         // Look for an unconditional branch following this conditional branch.
        // We do this because we need to reverse the successors in order
        // to implement FCMP_OEQ.
24560         if (User->getOpcode() == ISD::BR) {
24561           SDValue FalseBB = User->getOperand(1);
24562           SDNode *NewBR =
24563             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24564           assert(NewBR == User);
24565           (void)NewBR;
24566           Dest = FalseBB;
24567 
24568           SDValue Cmp =
24569               DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24570           SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24571           Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
24572                               CCVal, Cmp);
24573           CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24574           return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24575                              Cmp);
24576         }
24577       }
24578     } else if (CC == ISD::SETUNE) {
24579       // For FCMP_UNE, we can emit
24580       // two branches instead of an explicit OR instruction with a
24581       // separate test.
24582       SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24583       SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24584       Chain =
24585           DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
24586       CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24587       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24588                          Cmp);
24589     } else {
24590       X86::CondCode X86Cond =
24591           TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
24592       SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24593       SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24594       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24595                          Cmp);
24596     }
24597   }
24598 
24599   if (ISD::isOverflowIntrOpRes(Cond)) {
24600     SDValue Value, Overflow;
24601     X86::CondCode X86Cond;
24602     std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24603 
24604     SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24605     return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24606                        Overflow);
24607   }
24608 
24609   // Look past the truncate if the high bits are known zero.
24610   if (isTruncWithZeroHighBitsInput(Cond, DAG))
24611     Cond = Cond.getOperand(0);
24612 
24613   EVT CondVT = Cond.getValueType();
24614 
24615   // Add an AND with 1 if we don't already have one.
24616   if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
24617     Cond =
24618         DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
24619 
24620   SDValue LHS = Cond;
24621   SDValue RHS = DAG.getConstant(0, dl, CondVT);
24622 
24623   SDValue CCVal;
24624   SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
24625   return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24626                      EFLAGS);
24627 }
24628 
24629 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4K
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// the correct sequence.
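// Three lowering strategies are used below: an inline stack-pointer
// subtraction (or X86ISD::PROBED_ALLOCA when inline stack probing is enabled),
// X86ISD::SEG_ALLOCA for segmented stacks, and X86ISD::WIN_ALLOCA when the
// target requires a probing _alloca call.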
24634 SDValue
24635 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
24636                                            SelectionDAG &DAG) const {
24637   MachineFunction &MF = DAG.getMachineFunction();
24638   bool SplitStack = MF.shouldSplitStack();
24639   bool EmitStackProbeCall = hasStackProbeSymbol(MF);
24640   bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
24641                SplitStack || EmitStackProbeCall;
24642   SDLoc dl(Op);
24643 
24644   // Get the inputs.
24645   SDNode *Node = Op.getNode();
24646   SDValue Chain = Op.getOperand(0);
24647   SDValue Size  = Op.getOperand(1);
24648   MaybeAlign Alignment(Op.getConstantOperandVal(2));
24649   EVT VT = Node->getValueType(0);
24650 
24651   // Chain the dynamic stack allocation so that it doesn't modify the stack
24652   // pointer when other instructions are using the stack.
24653   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
24654 
24655   bool Is64Bit = Subtarget.is64Bit();
24656   MVT SPTy = getPointerTy(DAG.getDataLayout());
24657 
24658   SDValue Result;
24659   if (!Lower) {
24660     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24661     Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
24662     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
24663                     " not tell us which reg is the stack pointer!");
24664 
24665     const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24666     const Align StackAlign = TFI.getStackAlign();
24667     if (hasInlineStackProbe(MF)) {
24668       MachineRegisterInfo &MRI = MF.getRegInfo();
24669 
24670       const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24671       Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24672       Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24673       Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
24674                            DAG.getRegister(Vreg, SPTy));
24675     } else {
24676       SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
24677       Chain = SP.getValue(1);
24678       Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
24679     }
24680     if (Alignment && *Alignment > StackAlign)
24681       Result =
24682           DAG.getNode(ISD::AND, dl, VT, Result,
24683                       DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24684     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
24685   } else if (SplitStack) {
24686     MachineRegisterInfo &MRI = MF.getRegInfo();
24687 
24688     if (Is64Bit) {
      // The 64-bit implementation of segmented stacks needs to clobber both r10
      // and r11. This makes it impossible to use it along with nested
      // parameters.
24691       const Function &F = MF.getFunction();
24692       for (const auto &A : F.args()) {
24693         if (A.hasNestAttr())
24694           report_fatal_error("Cannot use segmented stacks with functions that "
24695                              "have nested arguments.");
24696       }
24697     }
24698 
24699     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24700     Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24701     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
                         DAG.getRegister(Vreg, SPTy));
24704   } else {
24705     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
24706     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
24707     MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
24708 
24709     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24710     Register SPReg = RegInfo->getStackRegister();
24711     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
24712     Chain = SP.getValue(1);
24713 
24714     if (Alignment) {
24715       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
24716                        DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24717       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
24718     }
24719 
24720     Result = SP;
24721   }
24722 
24723   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
24724                              DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
24725 
24726   SDValue Ops[2] = {Result, Chain};
24727   return DAG.getMergeValues(Ops, dl);
24728 }
24729 
24730 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
24731   MachineFunction &MF = DAG.getMachineFunction();
24732   auto PtrVT = getPointerTy(MF.getDataLayout());
24733   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
24734 
24735   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24736   SDLoc DL(Op);
24737 
24738   if (!Subtarget.is64Bit() ||
24739       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
24740     // vastart just stores the address of the VarArgsFrameIndex slot into the
24741     // memory location argument.
24742     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24743     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
24744                         MachinePointerInfo(SV));
24745   }
24746 
24747   // __va_list_tag:
24748   //   gp_offset         (0 - 6 * 8)
24749   //   fp_offset         (48 - 48 + 8 * 16)
24750   //   overflow_arg_area (point to parameters coming in memory).
24751   //   reg_save_area
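  // The fields are stored at byte offsets 0 (gp_offset), 4 (fp_offset),
  // 8 (overflow_arg_area) and 16 (reg_save_area), or 12 for the X32 ABI.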
24752   SmallVector<SDValue, 8> MemOps;
24753   SDValue FIN = Op.getOperand(1);
24754   // Store gp_offset
24755   SDValue Store = DAG.getStore(
24756       Op.getOperand(0), DL,
24757       DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
24758       MachinePointerInfo(SV));
24759   MemOps.push_back(Store);
24760 
24761   // Store fp_offset
24762   FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
24763   Store = DAG.getStore(
24764       Op.getOperand(0), DL,
24765       DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
24766       MachinePointerInfo(SV, 4));
24767   MemOps.push_back(Store);
24768 
24769   // Store ptr to overflow_arg_area
24770   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
24771   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24772   Store =
24773       DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
24774   MemOps.push_back(Store);
24775 
24776   // Store ptr to reg_save_area.
24777   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
24778       Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
24779   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
24780   Store = DAG.getStore(
24781       Op.getOperand(0), DL, RSFIN, FIN,
24782       MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
24783   MemOps.push_back(Store);
24784   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
24785 }
24786 
24787 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
24788   assert(Subtarget.is64Bit() &&
24789          "LowerVAARG only handles 64-bit va_arg!");
24790   assert(Op.getNumOperands() == 4);
24791 
24792   MachineFunction &MF = DAG.getMachineFunction();
24793   if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
24794     // The Win64 ABI uses char* instead of a structure.
24795     return DAG.expandVAArg(Op.getNode());
24796 
24797   SDValue Chain = Op.getOperand(0);
24798   SDValue SrcPtr = Op.getOperand(1);
24799   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24800   unsigned Align = Op.getConstantOperandVal(3);
24801   SDLoc dl(Op);
24802 
24803   EVT ArgVT = Op.getNode()->getValueType(0);
24804   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24805   uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
24806   uint8_t ArgMode;
24807 
24808   // Decide which area this value should be read from.
24809   // TODO: Implement the AMD64 ABI in its entirety. This simple
24810   // selection mechanism works only for the basic types.
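  // For example, a double is read via fp_offset from the XMM register save
  // area (ArgMode == 2), while an i32 or i64 is read via gp_offset
  // (ArgMode == 1).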
24811   assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
24812   if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
24813     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
24814   } else {
24815     assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
24816            "Unhandled argument type in LowerVAARG");
24817     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
24818   }
24819 
24820   if (ArgMode == 2) {
24821     // Sanity Check: Make sure using fp_offset makes sense.
24822     assert(!Subtarget.useSoftFloat() &&
24823            !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
24824            Subtarget.hasSSE1());
24825   }
24826 
  // Insert VAARG node into the DAG.
  // VAARG returns two values: the variable argument address and the chain.
24829   SDValue InstOps[] = {Chain, SrcPtr,
24830                        DAG.getTargetConstant(ArgSize, dl, MVT::i32),
24831                        DAG.getTargetConstant(ArgMode, dl, MVT::i8),
24832                        DAG.getTargetConstant(Align, dl, MVT::i32)};
24833   SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
24834   SDValue VAARG = DAG.getMemIntrinsicNode(
24835       Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
24836       VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
24837       /*Alignment=*/None,
24838       MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
24839   Chain = VAARG.getValue(1);
24840 
24841   // Load the next argument and return it
24842   return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
24843 }
24844 
24845 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
24846                            SelectionDAG &DAG) {
24847   // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
24848   // where a va_list is still an i8*.
24849   assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
24850   if (Subtarget.isCallingConvWin64(
24851         DAG.getMachineFunction().getFunction().getCallingConv()))
24852     // Probably a Win64 va_copy.
24853     return DAG.expandVACopy(Op.getNode());
24854 
24855   SDValue Chain = Op.getOperand(0);
24856   SDValue DstPtr = Op.getOperand(1);
24857   SDValue SrcPtr = Op.getOperand(2);
24858   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
24859   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
24860   SDLoc DL(Op);
24861 
24862   return DAG.getMemcpy(
24863       Chain, DL, DstPtr, SrcPtr,
24864       DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
24865       Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
24866       false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
24867 }
24868 
24869 // Helper to get immediate/variable SSE shift opcode from other shift opcodes.
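// For example, ISD::SHL maps to X86ISD::VSHL when the shift amount is
// variable and to X86ISD::VSHLI when it is a constant immediate.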
24870 static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
24871   switch (Opc) {
24872   case ISD::SHL:
24873   case X86ISD::VSHL:
24874   case X86ISD::VSHLI:
24875     return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
24876   case ISD::SRL:
24877   case X86ISD::VSRL:
24878   case X86ISD::VSRLI:
24879     return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
24880   case ISD::SRA:
24881   case X86ISD::VSRA:
24882   case X86ISD::VSRAI:
24883     return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
24884   }
24885   llvm_unreachable("Unknown target vector shift node");
24886 }
24887 
24888 /// Handle vector element shifts where the shift amount is a constant.
24889 /// Takes immediate version of shift as input.
24890 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
24891                                           SDValue SrcOp, uint64_t ShiftAmt,
24892                                           SelectionDAG &DAG) {
24893   MVT ElementType = VT.getVectorElementType();
24894 
  // Bitcast the source vector to the output type; this is mainly necessary for
  // vXi8/vXi64 shifts.
24897   if (VT != SrcOp.getSimpleValueType())
24898     SrcOp = DAG.getBitcast(VT, SrcOp);
24899 
24900   // Fold this packed shift into its first operand if ShiftAmt is 0.
24901   if (ShiftAmt == 0)
24902     return SrcOp;
24903 
24904   // Check for ShiftAmt >= element width
24905   if (ShiftAmt >= ElementType.getSizeInBits()) {
24906     if (Opc == X86ISD::VSRAI)
24907       ShiftAmt = ElementType.getSizeInBits() - 1;
24908     else
24909       return DAG.getConstant(0, dl, VT);
24910   }
24911 
24912   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
24913          && "Unknown target vector shift-by-constant node");
24914 
24915   // Fold this packed vector shift into a build vector if SrcOp is a
24916   // vector of Constants or UNDEFs.
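  // For example, (VSHLI <1, 2, undef, 4>, 3) folds to the constant build
  // vector <8, 16, 0, 32>.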
24917   if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
24918     SmallVector<SDValue, 8> Elts;
24919     unsigned NumElts = SrcOp->getNumOperands();
24920 
24921     switch (Opc) {
24922     default: llvm_unreachable("Unknown opcode!");
24923     case X86ISD::VSHLI:
24924       for (unsigned i = 0; i != NumElts; ++i) {
24925         SDValue CurrentOp = SrcOp->getOperand(i);
24926         if (CurrentOp->isUndef()) {
24927           // Must produce 0s in the correct bits.
24928           Elts.push_back(DAG.getConstant(0, dl, ElementType));
24929           continue;
24930         }
24931         auto *ND = cast<ConstantSDNode>(CurrentOp);
24932         const APInt &C = ND->getAPIntValue();
24933         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
24934       }
24935       break;
24936     case X86ISD::VSRLI:
24937       for (unsigned i = 0; i != NumElts; ++i) {
24938         SDValue CurrentOp = SrcOp->getOperand(i);
24939         if (CurrentOp->isUndef()) {
24940           // Must produce 0s in the correct bits.
24941           Elts.push_back(DAG.getConstant(0, dl, ElementType));
24942           continue;
24943         }
24944         auto *ND = cast<ConstantSDNode>(CurrentOp);
24945         const APInt &C = ND->getAPIntValue();
24946         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
24947       }
24948       break;
24949     case X86ISD::VSRAI:
24950       for (unsigned i = 0; i != NumElts; ++i) {
24951         SDValue CurrentOp = SrcOp->getOperand(i);
24952         if (CurrentOp->isUndef()) {
24953           // All shifted in bits must be the same so use 0.
24954           Elts.push_back(DAG.getConstant(0, dl, ElementType));
24955           continue;
24956         }
24957         auto *ND = cast<ConstantSDNode>(CurrentOp);
24958         const APInt &C = ND->getAPIntValue();
24959         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
24960       }
24961       break;
24962     }
24963 
24964     return DAG.getBuildVector(VT, dl, Elts);
24965   }
24966 
24967   return DAG.getNode(Opc, dl, VT, SrcOp,
24968                      DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
24969 }
24970 
24971 /// Handle vector element shifts where the shift amount may or may not be a
24972 /// constant. Takes immediate version of shift as input.
24973 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
24974                                    SDValue SrcOp, SDValue ShAmt,
24975                                    const X86Subtarget &Subtarget,
24976                                    SelectionDAG &DAG) {
24977   MVT SVT = ShAmt.getSimpleValueType();
24978   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
24979 
24980   // Catch shift-by-constant.
24981   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
24982     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
24983                                       CShAmt->getZExtValue(), DAG);
24984 
24985   // Change opcode to non-immediate version.
24986   Opc = getTargetVShiftUniformOpcode(Opc, true);
24987 
  // Need to build a vector containing the shift amount.
  // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
24990   // +====================+============+=======================================+
24991   // | ShAmt is           | HasSSE4.1? | Construct ShAmt vector as             |
24992   // +====================+============+=======================================+
24993   // | i64                | Yes, No    | Use ShAmt as lowest elt               |
24994   // | i32                | Yes        | zero-extend in-reg                    |
24995   // | (i32 zext(i16/i8)) | Yes        | zero-extend in-reg                    |
24996   // | (i32 zext(i16/i8)) | No         | byte-shift-in-reg                     |
24997   // | i16/i32            | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
24998   // +====================+============+=======================================+
24999 
25000   if (SVT == MVT::i64)
25001     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
25002   else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
25003            ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
25004            (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
25005             ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
25006     ShAmt = ShAmt.getOperand(0);
25007     MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
25008     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
25009     if (Subtarget.hasSSE41())
25010       ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25011                           MVT::v2i64, ShAmt);
25012     else {
25013       SDValue ByteShift = DAG.getTargetConstant(
25014           (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25015       ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25016       ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25017                           ByteShift);
25018       ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25019                           ByteShift);
25020     }
25021   } else if (Subtarget.hasSSE41() &&
25022              ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
25023     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25024     ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25025                         MVT::v2i64, ShAmt);
25026   } else {
25027     SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
25028                         DAG.getUNDEF(SVT)};
25029     ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
25030   }
25031 
25032   // The return type has to be a 128-bit type with the same element
25033   // type as the input type.
25034   MVT EltVT = VT.getVectorElementType();
25035   MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25036 
25037   ShAmt = DAG.getBitcast(ShVT, ShAmt);
25038   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25039 }
25040 
/// Return \p Mask with the necessary casting or extending
/// according to \p MaskVT when lowering masking intrinsics.
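/// For example, an i8 mask with MaskVT == MVT::v4i1 is bitcast to v8i1 and
/// the low four elements are extracted with EXTRACT_SUBVECTOR.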
25043 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25044                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
25045                            const SDLoc &dl) {
25046 
25047   if (isAllOnesConstant(Mask))
25048     return DAG.getConstant(1, dl, MaskVT);
25049   if (X86::isZeroNode(Mask))
25050     return DAG.getConstant(0, dl, MaskVT);
25051 
25052   assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25053 
25054   if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25055     assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25056     assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
    // In 32-bit mode a bitcast of i64 is illegal, so split the mask into two
    // i32 halves, bitcast each to v32i1 and concatenate them.
25058     SDValue Lo, Hi;
25059     Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25060                         DAG.getConstant(0, dl, MVT::i32));
25061     Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25062                         DAG.getConstant(1, dl, MVT::i32));
25063 
25064     Lo = DAG.getBitcast(MVT::v32i1, Lo);
25065     Hi = DAG.getBitcast(MVT::v32i1, Hi);
25066 
25067     return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25068   } else {
25069     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25070                                      Mask.getSimpleValueType().getSizeInBits());
    // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
    // are extracted by EXTRACT_SUBVECTOR.
25073     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25074                        DAG.getBitcast(BitcastVT, Mask),
25075                        DAG.getIntPtrConstant(0, dl));
25076   }
25077 }
25078 
25079 /// Return (and \p Op, \p Mask) for compare instructions or
25080 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25081 /// necessary casting or extending for \p Mask when lowering masking intrinsics
25082 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25083                   SDValue PreservedSrc,
25084                   const X86Subtarget &Subtarget,
25085                   SelectionDAG &DAG) {
25086   MVT VT = Op.getSimpleValueType();
25087   MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25088   unsigned OpcodeSelect = ISD::VSELECT;
25089   SDLoc dl(Op);
25090 
25091   if (isAllOnesConstant(Mask))
25092     return Op;
25093 
25094   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25095 
25096   if (PreservedSrc.isUndef())
25097     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25098   return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25099 }
25100 
25101 /// Creates an SDNode for a predicated scalar operation.
25102 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask comes in as MVT::i8 and should be transformed
/// to MVT::v1i1 while lowering masking intrinsics.
25105 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25106 /// "X86select" instead of "vselect". We just can't create the "vselect" node
25107 /// for a scalar instruction.
25108 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25109                                     SDValue PreservedSrc,
25110                                     const X86Subtarget &Subtarget,
25111                                     SelectionDAG &DAG) {
25112 
25113   if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25114     if (MaskConst->getZExtValue() & 0x1)
25115       return Op;
25116 
25117   MVT VT = Op.getSimpleValueType();
25118   SDLoc dl(Op);
25119 
  assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
25121   SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25122                               DAG.getBitcast(MVT::v8i1, Mask),
25123                               DAG.getIntPtrConstant(0, dl));
25124   if (Op.getOpcode() == X86ISD::FSETCCM ||
25125       Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25126       Op.getOpcode() == X86ISD::VFPCLASSS)
25127     return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25128 
25129   if (PreservedSrc.isUndef())
25130     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25131   return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25132 }
25133 
25134 static int getSEHRegistrationNodeSize(const Function *Fn) {
25135   if (!Fn->hasPersonalityFn())
25136     report_fatal_error(
25137         "querying registration node size for function without personality");
25138   // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25139   // WinEHStatePass for the full struct definition.
25140   switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25141   case EHPersonality::MSVC_X86SEH: return 24;
25142   case EHPersonality::MSVC_CXX: return 16;
25143   default: break;
25144   }
25145   report_fatal_error(
25146       "can only recover FP for 32-bit MSVC EH personality functions");
25147 }
25148 
25149 /// When the MSVC runtime transfers control to us, either to an outlined
25150 /// function or when returning to a parent frame after catching an exception, we
25151 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25152 /// Here's the math:
25153 ///   RegNodeBase = EntryEBP - RegNodeSize
25154 ///   ParentFP = RegNodeBase - ParentFrameOffset
25155 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
25156 /// subtracting the offset (negative on x86) takes us back to the parent FP.
25157 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25158                                    SDValue EntryEBP) {
25159   MachineFunction &MF = DAG.getMachineFunction();
25160   SDLoc dl;
25161 
25162   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25163   MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25164 
25165   // It's possible that the parent function no longer has a personality function
25166   // if the exceptional code was optimized away, in which case we just return
25167   // the incoming EBP.
25168   if (!Fn->hasPersonalityFn())
25169     return EntryEBP;
25170 
25171   // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25172   // registration, or the .set_setframe offset.
25173   MCSymbol *OffsetSym =
25174       MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
25175           GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25176   SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25177   SDValue ParentFrameOffset =
25178       DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25179 
25180   // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25181   // prologue to RBP in the parent function.
25182   const X86Subtarget &Subtarget =
25183       static_cast<const X86Subtarget &>(DAG.getSubtarget());
25184   if (Subtarget.is64Bit())
25185     return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25186 
25187   int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25188   // RegNodeBase = EntryEBP - RegNodeSize
25189   // ParentFP = RegNodeBase - ParentFrameOffset
25190   SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25191                                     DAG.getConstant(RegNodeSize, dl, PtrVT));
25192   return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25193 }
25194 
25195 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25196                                                    SelectionDAG &DAG) const {
25197   // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25198   auto isRoundModeCurDirection = [](SDValue Rnd) {
25199     if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25200       return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25201 
25202     return false;
25203   };
25204   auto isRoundModeSAE = [](SDValue Rnd) {
25205     if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25206       unsigned RC = C->getZExtValue();
25207       if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25208         // Clear the NO_EXC bit and check remaining bits.
25209         RC ^= X86::STATIC_ROUNDING::NO_EXC;
        // As a convenience, we allow either no other bits set or an explicit
        // current-direction rounding mode.
25212         return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25213       }
25214     }
25215 
25216     return false;
25217   };
25218   auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25219     if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25220       RC = C->getZExtValue();
25221       if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25222         // Clear the NO_EXC bit and check remaining bits.
25223         RC ^= X86::STATIC_ROUNDING::NO_EXC;
25224         return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25225                RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25226                RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25227                RC == X86::STATIC_ROUNDING::TO_ZERO;
25228       }
25229     }
25230 
25231     return false;
25232   };
25233 
25234   SDLoc dl(Op);
25235   unsigned IntNo = Op.getConstantOperandVal(0);
25236   MVT VT = Op.getSimpleValueType();
25237   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25238 
25239   // Propagate flags from original node to transformed node(s).
25240   SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25241 
25242   if (IntrData) {
25243     switch(IntrData->Type) {
25244     case INTR_TYPE_1OP: {
25245       // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25248       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25249       if (IntrWithRoundingModeOpcode != 0) {
25250         SDValue Rnd = Op.getOperand(2);
25251         unsigned RC = 0;
25252         if (isRoundModeSAEToX(Rnd, RC))
25253           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25254                              Op.getOperand(1),
25255                              DAG.getTargetConstant(RC, dl, MVT::i32));
25256         if (!isRoundModeCurDirection(Rnd))
25257           return SDValue();
25258       }
25259       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25260                          Op.getOperand(1));
25261     }
25262     case INTR_TYPE_1OP_SAE: {
25263       SDValue Sae = Op.getOperand(2);
25264 
25265       unsigned Opc;
25266       if (isRoundModeCurDirection(Sae))
25267         Opc = IntrData->Opc0;
25268       else if (isRoundModeSAE(Sae))
25269         Opc = IntrData->Opc1;
25270       else
25271         return SDValue();
25272 
25273       return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25274     }
25275     case INTR_TYPE_2OP: {
25276       SDValue Src2 = Op.getOperand(2);
25277 
25278       // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25281       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25282       if (IntrWithRoundingModeOpcode != 0) {
25283         SDValue Rnd = Op.getOperand(3);
25284         unsigned RC = 0;
25285         if (isRoundModeSAEToX(Rnd, RC))
25286           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25287                              Op.getOperand(1), Src2,
25288                              DAG.getTargetConstant(RC, dl, MVT::i32));
25289         if (!isRoundModeCurDirection(Rnd))
25290           return SDValue();
25291       }
25292 
25293       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25294                          Op.getOperand(1), Src2);
25295     }
25296     case INTR_TYPE_2OP_SAE: {
25297       SDValue Sae = Op.getOperand(3);
25298 
25299       unsigned Opc;
25300       if (isRoundModeCurDirection(Sae))
25301         Opc = IntrData->Opc0;
25302       else if (isRoundModeSAE(Sae))
25303         Opc = IntrData->Opc1;
25304       else
25305         return SDValue();
25306 
25307       return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25308                          Op.getOperand(2));
25309     }
25310     case INTR_TYPE_3OP:
25311     case INTR_TYPE_3OP_IMM8: {
25312       SDValue Src1 = Op.getOperand(1);
25313       SDValue Src2 = Op.getOperand(2);
25314       SDValue Src3 = Op.getOperand(3);
25315 
25316       if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25317           Src3.getValueType() != MVT::i8) {
25318         Src3 = DAG.getTargetConstant(
25319             cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
25320       }
25321 
25322       // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25325       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25326       if (IntrWithRoundingModeOpcode != 0) {
25327         SDValue Rnd = Op.getOperand(4);
25328         unsigned RC = 0;
25329         if (isRoundModeSAEToX(Rnd, RC))
25330           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25331                              Src1, Src2, Src3,
25332                              DAG.getTargetConstant(RC, dl, MVT::i32));
25333         if (!isRoundModeCurDirection(Rnd))
25334           return SDValue();
25335       }
25336 
25337       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25338                          {Src1, Src2, Src3});
25339     }
25340     case INTR_TYPE_4OP_IMM8: {
25341       assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
25342       SDValue Src4 = Op.getOperand(4);
25343       if (Src4.getValueType() != MVT::i8) {
25344         Src4 = DAG.getTargetConstant(
25345             cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
25346       }
25347 
25348       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25349                          Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25350                          Src4);
25351     }
25352     case INTR_TYPE_1OP_MASK: {
25353       SDValue Src = Op.getOperand(1);
25354       SDValue PassThru = Op.getOperand(2);
25355       SDValue Mask = Op.getOperand(3);
      // We add the rounding mode to the node when
      //   - an RC opcode is specified and
      //   - RC is not "current direction".
25359       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25360       if (IntrWithRoundingModeOpcode != 0) {
25361         SDValue Rnd = Op.getOperand(4);
25362         unsigned RC = 0;
25363         if (isRoundModeSAEToX(Rnd, RC))
25364           return getVectorMaskingNode(
25365               DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25366                           Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25367               Mask, PassThru, Subtarget, DAG);
25368         if (!isRoundModeCurDirection(Rnd))
25369           return SDValue();
25370       }
25371       return getVectorMaskingNode(
25372           DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25373           Subtarget, DAG);
25374     }
25375     case INTR_TYPE_1OP_MASK_SAE: {
25376       SDValue Src = Op.getOperand(1);
25377       SDValue PassThru = Op.getOperand(2);
25378       SDValue Mask = Op.getOperand(3);
25379       SDValue Rnd = Op.getOperand(4);
25380 
25381       unsigned Opc;
25382       if (isRoundModeCurDirection(Rnd))
25383         Opc = IntrData->Opc0;
25384       else if (isRoundModeSAE(Rnd))
25385         Opc = IntrData->Opc1;
25386       else
25387         return SDValue();
25388 
25389       return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25390                                   Subtarget, DAG);
25391     }
25392     case INTR_TYPE_SCALAR_MASK: {
25393       SDValue Src1 = Op.getOperand(1);
25394       SDValue Src2 = Op.getOperand(2);
25395       SDValue passThru = Op.getOperand(3);
25396       SDValue Mask = Op.getOperand(4);
25397       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25398       // There are 2 kinds of intrinsics in this group:
      // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
25400       // (2) With rounding mode and sae - 7 operands.
25401       bool HasRounding = IntrWithRoundingModeOpcode != 0;
25402       if (Op.getNumOperands() == (5U + HasRounding)) {
25403         if (HasRounding) {
25404           SDValue Rnd = Op.getOperand(5);
25405           unsigned RC = 0;
25406           if (isRoundModeSAEToX(Rnd, RC))
25407             return getScalarMaskingNode(
25408                 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25409                             DAG.getTargetConstant(RC, dl, MVT::i32)),
25410                 Mask, passThru, Subtarget, DAG);
25411           if (!isRoundModeCurDirection(Rnd))
25412             return SDValue();
25413         }
25414         return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25415                                                 Src2),
25416                                     Mask, passThru, Subtarget, DAG);
25417       }
25418 
25419       assert(Op.getNumOperands() == (6U + HasRounding) &&
25420              "Unexpected intrinsic form");
25421       SDValue RoundingMode = Op.getOperand(5);
25422       unsigned Opc = IntrData->Opc0;
25423       if (HasRounding) {
25424         SDValue Sae = Op.getOperand(6);
25425         if (isRoundModeSAE(Sae))
25426           Opc = IntrWithRoundingModeOpcode;
25427         else if (!isRoundModeCurDirection(Sae))
25428           return SDValue();
25429       }
25430       return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25431                                               Src2, RoundingMode),
25432                                   Mask, passThru, Subtarget, DAG);
25433     }
25434     case INTR_TYPE_SCALAR_MASK_RND: {
25435       SDValue Src1 = Op.getOperand(1);
25436       SDValue Src2 = Op.getOperand(2);
25437       SDValue passThru = Op.getOperand(3);
25438       SDValue Mask = Op.getOperand(4);
25439       SDValue Rnd = Op.getOperand(5);
25440 
25441       SDValue NewOp;
25442       unsigned RC = 0;
25443       if (isRoundModeCurDirection(Rnd))
25444         NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25445       else if (isRoundModeSAEToX(Rnd, RC))
25446         NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25447                             DAG.getTargetConstant(RC, dl, MVT::i32));
25448       else
25449         return SDValue();
25450 
25451       return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25452     }
25453     case INTR_TYPE_SCALAR_MASK_SAE: {
25454       SDValue Src1 = Op.getOperand(1);
25455       SDValue Src2 = Op.getOperand(2);
25456       SDValue passThru = Op.getOperand(3);
25457       SDValue Mask = Op.getOperand(4);
25458       SDValue Sae = Op.getOperand(5);
25459       unsigned Opc;
25460       if (isRoundModeCurDirection(Sae))
25461         Opc = IntrData->Opc0;
25462       else if (isRoundModeSAE(Sae))
25463         Opc = IntrData->Opc1;
25464       else
25465         return SDValue();
25466 
25467       return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25468                                   Mask, passThru, Subtarget, DAG);
25469     }
25470     case INTR_TYPE_2OP_MASK: {
25471       SDValue Src1 = Op.getOperand(1);
25472       SDValue Src2 = Op.getOperand(2);
25473       SDValue PassThru = Op.getOperand(3);
25474       SDValue Mask = Op.getOperand(4);
25475       SDValue NewOp;
25476       if (IntrData->Opc1 != 0) {
25477         SDValue Rnd = Op.getOperand(5);
25478         unsigned RC = 0;
25479         if (isRoundModeSAEToX(Rnd, RC))
25480           NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25481                               DAG.getTargetConstant(RC, dl, MVT::i32));
25482         else if (!isRoundModeCurDirection(Rnd))
25483           return SDValue();
25484       }
25485       if (!NewOp)
25486         NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25487       return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25488     }
25489     case INTR_TYPE_2OP_MASK_SAE: {
25490       SDValue Src1 = Op.getOperand(1);
25491       SDValue Src2 = Op.getOperand(2);
25492       SDValue PassThru = Op.getOperand(3);
25493       SDValue Mask = Op.getOperand(4);
25494 
25495       unsigned Opc = IntrData->Opc0;
25496       if (IntrData->Opc1 != 0) {
25497         SDValue Sae = Op.getOperand(5);
25498         if (isRoundModeSAE(Sae))
25499           Opc = IntrData->Opc1;
25500         else if (!isRoundModeCurDirection(Sae))
25501           return SDValue();
25502       }
25503 
25504       return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25505                                   Mask, PassThru, Subtarget, DAG);
25506     }
25507     case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25508       SDValue Src1 = Op.getOperand(1);
25509       SDValue Src2 = Op.getOperand(2);
25510       SDValue Src3 = Op.getOperand(3);
25511       SDValue PassThru = Op.getOperand(4);
25512       SDValue Mask = Op.getOperand(5);
25513       SDValue Sae = Op.getOperand(6);
25514       unsigned Opc;
25515       if (isRoundModeCurDirection(Sae))
25516         Opc = IntrData->Opc0;
25517       else if (isRoundModeSAE(Sae))
25518         Opc = IntrData->Opc1;
25519       else
25520         return SDValue();
25521 
25522       return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25523                                   Mask, PassThru, Subtarget, DAG);
25524     }
25525     case INTR_TYPE_3OP_MASK_SAE: {
25526       SDValue Src1 = Op.getOperand(1);
25527       SDValue Src2 = Op.getOperand(2);
25528       SDValue Src3 = Op.getOperand(3);
25529       SDValue PassThru = Op.getOperand(4);
25530       SDValue Mask = Op.getOperand(5);
25531 
25532       unsigned Opc = IntrData->Opc0;
25533       if (IntrData->Opc1 != 0) {
25534         SDValue Sae = Op.getOperand(6);
25535         if (isRoundModeSAE(Sae))
25536           Opc = IntrData->Opc1;
25537         else if (!isRoundModeCurDirection(Sae))
25538           return SDValue();
25539       }
25540       return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25541                                   Mask, PassThru, Subtarget, DAG);
25542     }
25543     case BLENDV: {
25544       SDValue Src1 = Op.getOperand(1);
25545       SDValue Src2 = Op.getOperand(2);
25546       SDValue Src3 = Op.getOperand(3);
25547 
25548       EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
25549       Src3 = DAG.getBitcast(MaskVT, Src3);
25550 
25551       // Reverse the operands to match VSELECT order.
25552       return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25553     }
25554     case VPERM_2OP : {
25555       SDValue Src1 = Op.getOperand(1);
25556       SDValue Src2 = Op.getOperand(2);
25557 
25558       // Swap Src1 and Src2 in the node creation
      return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
25560     }
25561     case IFMA_OP:
25562       // NOTE: We need to swizzle the operands to pass the multiply operands
25563       // first.
25564       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25565                          Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
25566     case FPCLASSS: {
25567       SDValue Src1 = Op.getOperand(1);
25568       SDValue Imm = Op.getOperand(2);
25569       SDValue Mask = Op.getOperand(3);
25570       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25571       SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
25572                                                  Subtarget, DAG);
25573       // Need to fill with zeros to ensure the bitcast will produce zeroes
25574       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25575       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25576                                 DAG.getConstant(0, dl, MVT::v8i1),
25577                                 FPclassMask, DAG.getIntPtrConstant(0, dl));
25578       return DAG.getBitcast(MVT::i8, Ins);
25579     }
25580 
25581     case CMP_MASK_CC: {
25582       MVT MaskVT = Op.getSimpleValueType();
25583       SDValue CC = Op.getOperand(3);
25584       SDValue Mask = Op.getOperand(4);
25585       // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25588       if (IntrData->Opc1 != 0) {
25589         SDValue Sae = Op.getOperand(5);
25590         if (isRoundModeSAE(Sae))
25591           return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25592                              Op.getOperand(2), CC, Mask, Sae);
25593         if (!isRoundModeCurDirection(Sae))
25594           return SDValue();
25595       }
      // Default rounding mode.
25597       return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25598                          {Op.getOperand(1), Op.getOperand(2), CC, Mask});
25599     }
25600     case CMP_MASK_SCALAR_CC: {
25601       SDValue Src1 = Op.getOperand(1);
25602       SDValue Src2 = Op.getOperand(2);
25603       SDValue CC = Op.getOperand(3);
25604       SDValue Mask = Op.getOperand(4);
25605 
25606       SDValue Cmp;
25607       if (IntrData->Opc1 != 0) {
25608         SDValue Sae = Op.getOperand(5);
25609         if (isRoundModeSAE(Sae))
25610           Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25611         else if (!isRoundModeCurDirection(Sae))
25612           return SDValue();
25613       }
      // Default rounding mode.
25615       if (!Cmp.getNode())
25616         Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25617 
25618       SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
25619                                              Subtarget, DAG);
25620       // Need to fill with zeros to ensure the bitcast will produce zeroes
25621       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25622       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25623                                 DAG.getConstant(0, dl, MVT::v8i1),
25624                                 CmpMask, DAG.getIntPtrConstant(0, dl));
25625       return DAG.getBitcast(MVT::i8, Ins);
25626     }
25627     case COMI: { // Comparison intrinsics
25628       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25629       SDValue LHS = Op.getOperand(1);
25630       SDValue RHS = Op.getOperand(2);
25631       // Some conditions require the operands to be swapped.
25632       if (CC == ISD::SETLT || CC == ISD::SETLE)
25633         std::swap(LHS, RHS);
25634 
25635       SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
25636       SDValue SetCC;
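      // (U)COMIS sets ZF/PF/CF as follows: unordered = 111, equal = 100,
      // less-than = 001, greater-than = 000. An unordered result sets ZF, so
      // the EQ/NE cases also test PF to account for NaN operands.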
25637       switch (CC) {
      case ISD::SETEQ: { // (ZF = 1 and PF = 0)
25639         SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
25640         SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
25641         SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
25642         break;
25643       }
      case ISD::SETNE: { // (ZF = 0 or PF = 1)
25645         SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
25646         SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
25647         SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
25648         break;
25649       }
25650       case ISD::SETGT: // (CF = 0 and ZF = 0)
25651       case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
25652         SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
25653         break;
25654       }
25655       case ISD::SETGE: // CF = 0
25656       case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
25657         SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
25658         break;
25659       default:
25660         llvm_unreachable("Unexpected illegal condition!");
25661       }
25662       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25663     }
25664     case COMI_RM: { // Comparison intrinsics with Sae
25665       SDValue LHS = Op.getOperand(1);
25666       SDValue RHS = Op.getOperand(2);
25667       unsigned CondVal = Op.getConstantOperandVal(3);
25668       SDValue Sae = Op.getOperand(4);
25669 
25670       SDValue FCmp;
25671       if (isRoundModeCurDirection(Sae))
25672         FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
25673                            DAG.getTargetConstant(CondVal, dl, MVT::i8));
25674       else if (isRoundModeSAE(Sae))
25675         FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
25676                            DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
25677       else
25678         return SDValue();
25679       // Need to fill with zeros to ensure the bitcast will produce zeroes
25680       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25681       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25682                                 DAG.getConstant(0, dl, MVT::v16i1),
25683                                 FCmp, DAG.getIntPtrConstant(0, dl));
25684       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
25685                          DAG.getBitcast(MVT::i16, Ins));
25686     }
25687     case VSHIFT:
25688       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
25689                                  Op.getOperand(1), Op.getOperand(2), Subtarget,
25690                                  DAG);
25691     case COMPRESS_EXPAND_IN_REG: {
25692       SDValue Mask = Op.getOperand(3);
25693       SDValue DataToCompress = Op.getOperand(1);
25694       SDValue PassThru = Op.getOperand(2);
25695       if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
25696         return Op.getOperand(1);
25697 
25698       // Avoid false dependency.
25699       if (PassThru.isUndef())
25700         PassThru = DAG.getConstant(0, dl, VT);
25701 
25702       return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
25703                          Mask);
25704     }
25705     case FIXUPIMM:
25706     case FIXUPIMM_MASKZ: {
25707       SDValue Src1 = Op.getOperand(1);
25708       SDValue Src2 = Op.getOperand(2);
25709       SDValue Src3 = Op.getOperand(3);
25710       SDValue Imm = Op.getOperand(4);
25711       SDValue Mask = Op.getOperand(5);
25712       SDValue Passthru = (IntrData->Type == FIXUPIMM)
25713                              ? Src1
25714                              : getZeroVector(VT, Subtarget, DAG, dl);
25715 
25716       unsigned Opc = IntrData->Opc0;
25717       if (IntrData->Opc1 != 0) {
25718         SDValue Sae = Op.getOperand(6);
25719         if (isRoundModeSAE(Sae))
25720           Opc = IntrData->Opc1;
25721         else if (!isRoundModeCurDirection(Sae))
25722           return SDValue();
25723       }
25724 
25725       SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
25726 
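      // The packed forms (VFIXUPIMM/VFIXUPIMM_SAE) take a vector mask; the
      // scalar forms use the scalar masking helper instead.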
25727       if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
25728         return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25729 
25730       return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25731     }
25732     case ROUNDP: {
25733       assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
25734       // Clear the upper bits of the rounding immediate so that the legacy
25735       // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25736       auto Round = cast<ConstantSDNode>(Op.getOperand(2));
25737       SDValue RoundingMode =
25738           DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25739       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25740                          Op.getOperand(1), RoundingMode);
25741     }
25742     case ROUNDS: {
25743       assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
25744       // Clear the upper bits of the rounding immediate so that the legacy
25745       // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25746       auto Round = cast<ConstantSDNode>(Op.getOperand(3));
25747       SDValue RoundingMode =
25748           DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25749       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25750                          Op.getOperand(1), Op.getOperand(2), RoundingMode);
25751     }
25752     case BEXTRI: {
25753       assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
25754 
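      // Only the low 16 bits of the control are meaningful: bits [7:0] hold
      // the start position and bits [15:8] the length. For example, control
      // 0x0804 extracts 8 bits starting at bit 4.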
25755       uint64_t Imm = Op.getConstantOperandVal(2);
25756       SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
25757                                               Op.getValueType());
25758       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25759                          Op.getOperand(1), Control);
25760     }
25761     // ADC/ADCX/SBB
25762     case ADX: {
25763       SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
25764       SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
25765 
25766       SDValue Res;
25767       // If the carry in is zero, then we should just use ADD/SUB instead of
25768       // ADC/SBB.
25769       if (isNullConstant(Op.getOperand(1))) {
25770         Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
25771                           Op.getOperand(3));
25772       } else {
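        // Materialize CF from the carry-in: adding -1 to the 8-bit carry-in
        // sets CF exactly when it is non-zero (e.g. 1 + 0xFF = 0x100 -> CF=1,
        // 0 + 0xFF = 0xFF -> CF=0), and ADC/SBB then consume that flag.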
25773         SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
25774                                     DAG.getConstant(-1, dl, MVT::i8));
25775         Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
25776                           Op.getOperand(3), GenCF.getValue(1));
25777       }
25778       SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
25779       SDValue Results[] = { SetCC, Res };
25780       return DAG.getMergeValues(Results, dl);
25781     }
25782     case CVTPD2PS_MASK:
25783     case CVTPD2DQ_MASK:
25784     case CVTQQ2PS_MASK:
25785     case TRUNCATE_TO_REG: {
25786       SDValue Src = Op.getOperand(1);
25787       SDValue PassThru = Op.getOperand(2);
25788       SDValue Mask = Op.getOperand(3);
25789 
25790       if (isAllOnesConstant(Mask))
25791         return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25792 
25793       MVT SrcVT = Src.getSimpleValueType();
25794       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25795       Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25796       return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
25797                          {Src, PassThru, Mask});
25798     }
25799     case CVTPS2PH_MASK: {
25800       SDValue Src = Op.getOperand(1);
25801       SDValue Rnd = Op.getOperand(2);
25802       SDValue PassThru = Op.getOperand(3);
25803       SDValue Mask = Op.getOperand(4);
25804 
25805       if (isAllOnesConstant(Mask))
25806         return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
25807 
25808       MVT SrcVT = Src.getSimpleValueType();
25809       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25810       Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25811       return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
25812                          PassThru, Mask);
25813 
25814     }
25815     case CVTNEPS2BF16_MASK: {
25816       SDValue Src = Op.getOperand(1);
25817       SDValue PassThru = Op.getOperand(2);
25818       SDValue Mask = Op.getOperand(3);
25819 
25820       if (ISD::isBuildVectorAllOnes(Mask.getNode()))
25821         return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25822 
25823       // Break false dependency.
25824       if (PassThru.isUndef())
25825         PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
25826 
25827       return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
25828                          Mask);
25829     }
25830     default:
25831       break;
25832     }
25833   }
25834 
25835   switch (IntNo) {
25836   default: return SDValue();    // Don't custom lower most intrinsics.
25837 
  // ptest and testp intrinsics. The intrinsics these come from are designed to
  // return an integer value, not just an instruction, so lower them to the
  // ptest or testp pattern and a setcc for the result.
25841   case Intrinsic::x86_avx512_ktestc_b:
25842   case Intrinsic::x86_avx512_ktestc_w:
25843   case Intrinsic::x86_avx512_ktestc_d:
25844   case Intrinsic::x86_avx512_ktestc_q:
25845   case Intrinsic::x86_avx512_ktestz_b:
25846   case Intrinsic::x86_avx512_ktestz_w:
25847   case Intrinsic::x86_avx512_ktestz_d:
25848   case Intrinsic::x86_avx512_ktestz_q:
25849   case Intrinsic::x86_sse41_ptestz:
25850   case Intrinsic::x86_sse41_ptestc:
25851   case Intrinsic::x86_sse41_ptestnzc:
25852   case Intrinsic::x86_avx_ptestz_256:
25853   case Intrinsic::x86_avx_ptestc_256:
25854   case Intrinsic::x86_avx_ptestnzc_256:
25855   case Intrinsic::x86_avx_vtestz_ps:
25856   case Intrinsic::x86_avx_vtestc_ps:
25857   case Intrinsic::x86_avx_vtestnzc_ps:
25858   case Intrinsic::x86_avx_vtestz_pd:
25859   case Intrinsic::x86_avx_vtestc_pd:
25860   case Intrinsic::x86_avx_vtestnzc_pd:
25861   case Intrinsic::x86_avx_vtestz_ps_256:
25862   case Intrinsic::x86_avx_vtestc_ps_256:
25863   case Intrinsic::x86_avx_vtestnzc_ps_256:
25864   case Intrinsic::x86_avx_vtestz_pd_256:
25865   case Intrinsic::x86_avx_vtestc_pd_256:
25866   case Intrinsic::x86_avx_vtestnzc_pd_256: {
25867     unsigned TestOpc = X86ISD::PTEST;
25868     X86::CondCode X86CC;
25869     switch (IntNo) {
25870     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
25871     case Intrinsic::x86_avx512_ktestc_b:
25872     case Intrinsic::x86_avx512_ktestc_w:
25873     case Intrinsic::x86_avx512_ktestc_d:
25874     case Intrinsic::x86_avx512_ktestc_q:
25875       // CF = 1
25876       TestOpc = X86ISD::KTEST;
25877       X86CC = X86::COND_B;
25878       break;
25879     case Intrinsic::x86_avx512_ktestz_b:
25880     case Intrinsic::x86_avx512_ktestz_w:
25881     case Intrinsic::x86_avx512_ktestz_d:
25882     case Intrinsic::x86_avx512_ktestz_q:
25883       TestOpc = X86ISD::KTEST;
25884       X86CC = X86::COND_E;
25885       break;
25886     case Intrinsic::x86_avx_vtestz_ps:
25887     case Intrinsic::x86_avx_vtestz_pd:
25888     case Intrinsic::x86_avx_vtestz_ps_256:
25889     case Intrinsic::x86_avx_vtestz_pd_256:
25890       TestOpc = X86ISD::TESTP;
25891       LLVM_FALLTHROUGH;
25892     case Intrinsic::x86_sse41_ptestz:
25893     case Intrinsic::x86_avx_ptestz_256:
25894       // ZF = 1
25895       X86CC = X86::COND_E;
25896       break;
25897     case Intrinsic::x86_avx_vtestc_ps:
25898     case Intrinsic::x86_avx_vtestc_pd:
25899     case Intrinsic::x86_avx_vtestc_ps_256:
25900     case Intrinsic::x86_avx_vtestc_pd_256:
25901       TestOpc = X86ISD::TESTP;
25902       LLVM_FALLTHROUGH;
25903     case Intrinsic::x86_sse41_ptestc:
25904     case Intrinsic::x86_avx_ptestc_256:
25905       // CF = 1
25906       X86CC = X86::COND_B;
25907       break;
25908     case Intrinsic::x86_avx_vtestnzc_ps:
25909     case Intrinsic::x86_avx_vtestnzc_pd:
25910     case Intrinsic::x86_avx_vtestnzc_ps_256:
25911     case Intrinsic::x86_avx_vtestnzc_pd_256:
25912       TestOpc = X86ISD::TESTP;
25913       LLVM_FALLTHROUGH;
25914     case Intrinsic::x86_sse41_ptestnzc:
25915     case Intrinsic::x86_avx_ptestnzc_256:
25916       // ZF and CF = 0
25917       X86CC = X86::COND_A;
25918       break;
25919     }
25920 
25921     SDValue LHS = Op.getOperand(1);
25922     SDValue RHS = Op.getOperand(2);
25923     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
25924     SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
25925     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25926   }
25927 
25928   case Intrinsic::x86_sse42_pcmpistria128:
25929   case Intrinsic::x86_sse42_pcmpestria128:
25930   case Intrinsic::x86_sse42_pcmpistric128:
25931   case Intrinsic::x86_sse42_pcmpestric128:
25932   case Intrinsic::x86_sse42_pcmpistrio128:
25933   case Intrinsic::x86_sse42_pcmpestrio128:
25934   case Intrinsic::x86_sse42_pcmpistris128:
25935   case Intrinsic::x86_sse42_pcmpestris128:
25936   case Intrinsic::x86_sse42_pcmpistriz128:
25937   case Intrinsic::x86_sse42_pcmpestriz128: {
25938     unsigned Opcode;
25939     X86::CondCode X86CC;
25940     switch (IntNo) {
25941     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
25942     case Intrinsic::x86_sse42_pcmpistria128:
25943       Opcode = X86ISD::PCMPISTR;
25944       X86CC = X86::COND_A;
25945       break;
25946     case Intrinsic::x86_sse42_pcmpestria128:
25947       Opcode = X86ISD::PCMPESTR;
25948       X86CC = X86::COND_A;
25949       break;
25950     case Intrinsic::x86_sse42_pcmpistric128:
25951       Opcode = X86ISD::PCMPISTR;
25952       X86CC = X86::COND_B;
25953       break;
25954     case Intrinsic::x86_sse42_pcmpestric128:
25955       Opcode = X86ISD::PCMPESTR;
25956       X86CC = X86::COND_B;
25957       break;
25958     case Intrinsic::x86_sse42_pcmpistrio128:
25959       Opcode = X86ISD::PCMPISTR;
25960       X86CC = X86::COND_O;
25961       break;
25962     case Intrinsic::x86_sse42_pcmpestrio128:
25963       Opcode = X86ISD::PCMPESTR;
25964       X86CC = X86::COND_O;
25965       break;
25966     case Intrinsic::x86_sse42_pcmpistris128:
25967       Opcode = X86ISD::PCMPISTR;
25968       X86CC = X86::COND_S;
25969       break;
25970     case Intrinsic::x86_sse42_pcmpestris128:
25971       Opcode = X86ISD::PCMPESTR;
25972       X86CC = X86::COND_S;
25973       break;
25974     case Intrinsic::x86_sse42_pcmpistriz128:
25975       Opcode = X86ISD::PCMPISTR;
25976       X86CC = X86::COND_E;
25977       break;
25978     case Intrinsic::x86_sse42_pcmpestriz128:
25979       Opcode = X86ISD::PCMPESTR;
25980       X86CC = X86::COND_E;
25981       break;
25982     }
25983     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25984     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25985     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
25986     SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
25987     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25988   }
25989 
25990   case Intrinsic::x86_sse42_pcmpistri128:
25991   case Intrinsic::x86_sse42_pcmpestri128: {
25992     unsigned Opcode;
25993     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
25994       Opcode = X86ISD::PCMPISTR;
25995     else
25996       Opcode = X86ISD::PCMPESTR;
25997 
25998     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25999     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26000     return DAG.getNode(Opcode, dl, VTs, NewOps);
26001   }
26002 
26003   case Intrinsic::x86_sse42_pcmpistrm128:
26004   case Intrinsic::x86_sse42_pcmpestrm128: {
26005     unsigned Opcode;
26006     if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26007       Opcode = X86ISD::PCMPISTR;
26008     else
26009       Opcode = X86ISD::PCMPESTR;
26010 
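    // The mask-producing forms want the v16i8 result, which is value 1 of the
    // node.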
26011     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
26012     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26013     return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26014   }
26015 
26016   case Intrinsic::eh_sjlj_lsda: {
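    // Return the address of this function's language-specific data area, i.e.
    // its GCC_except_table<N> symbol.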
26017     MachineFunction &MF = DAG.getMachineFunction();
26018     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26019     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26020     auto &Context = MF.getMMI().getContext();
26021     MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26022                                             Twine(MF.getFunctionNumber()));
26023     return DAG.getNode(getGlobalWrapperKind(), dl, VT,
26024                        DAG.getMCSymbol(S, PtrVT));
26025   }
26026 
26027   case Intrinsic::x86_seh_lsda: {
26028     // Compute the symbol for the LSDA. We know it'll get emitted later.
26029     MachineFunction &MF = DAG.getMachineFunction();
26030     SDValue Op1 = Op.getOperand(1);
26031     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26032     MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
26033         GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26034 
26035     // Generate a simple absolute symbol reference. This intrinsic is only
26036     // supported on 32-bit Windows, which isn't PIC.
26037     SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26038     return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26039   }
26040 
26041   case Intrinsic::eh_recoverfp: {
26042     SDValue FnOp = Op.getOperand(1);
26043     SDValue IncomingFPOp = Op.getOperand(2);
26044     GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26045     auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26046     if (!Fn)
26047       report_fatal_error(
26048           "llvm.eh.recoverfp must take a function as the first argument");
26049     return recoverFramePointer(DAG, Fn, IncomingFPOp);
26050   }
26051 
26052   case Intrinsic::localaddress: {
26053     // Returns one of the stack, base, or frame pointer registers, depending on
26054     // which is used to reference local variables.
26055     MachineFunction &MF = DAG.getMachineFunction();
26056     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26057     unsigned Reg;
26058     if (RegInfo->hasBasePointer(MF))
26059       Reg = RegInfo->getBaseRegister();
26060     else { // Handles the SP or FP case.
26061       bool CantUseFP = RegInfo->hasStackRealignment(MF);
26062       if (CantUseFP)
26063         Reg = RegInfo->getPtrSizedStackRegister(MF);
26064       else
26065         Reg = RegInfo->getPtrSizedFrameRegister(MF);
26066     }
26067     return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26068   }
26069   case Intrinsic::swift_async_context_addr: {
26070     auto &MF = DAG.getMachineFunction();
26071     auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26072     if (Subtarget.is64Bit()) {
26073       MF.getFrameInfo().setFrameAddressIsTaken(true);
26074       X86FI->setHasSwiftAsyncContext(true);
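      // In the 64-bit extended frame the async context sits just below the
      // saved frame pointer, so its address is simply RBP - 8.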
26075       return SDValue(
26076           DAG.getMachineNode(
26077               X86::SUB64ri8, dl, MVT::i64,
26078               DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
26079               DAG.getTargetConstant(8, dl, MVT::i32)),
26080           0);
26081     } else {
      // 32-bit targets have no special extended frame; create or reuse an
      // existing stack slot.
26084       if (!X86FI->getSwiftAsyncContextFrameIdx())
26085         X86FI->setSwiftAsyncContextFrameIdx(
26086             MF.getFrameInfo().CreateStackObject(4, Align(4), false));
26087       return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
26088     }
26089   }
26090   case Intrinsic::x86_avx512_vp2intersect_q_512:
26091   case Intrinsic::x86_avx512_vp2intersect_q_256:
26092   case Intrinsic::x86_avx512_vp2intersect_q_128:
26093   case Intrinsic::x86_avx512_vp2intersect_d_512:
26094   case Intrinsic::x86_avx512_vp2intersect_d_256:
26095   case Intrinsic::x86_avx512_vp2intersect_d_128: {
26096     MVT MaskVT = Op.getSimpleValueType();
26097 
26098     SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26099     SDLoc DL(Op);
26100 
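    // VP2INTERSECT writes a pair of mask registers; model the pair as an
    // untyped result and extract the two k-registers as subregisters.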
26101     SDValue Operation =
26102         DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
26103                     Op->getOperand(1), Op->getOperand(2));
26104 
26105     SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26106                                                  MaskVT, Operation);
26107     SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26108                                                  MaskVT, Operation);
26109     return DAG.getMergeValues({Result0, Result1}, DL);
26110   }
26111   case Intrinsic::x86_mmx_pslli_w:
26112   case Intrinsic::x86_mmx_pslli_d:
26113   case Intrinsic::x86_mmx_pslli_q:
26114   case Intrinsic::x86_mmx_psrli_w:
26115   case Intrinsic::x86_mmx_psrli_d:
26116   case Intrinsic::x86_mmx_psrli_q:
26117   case Intrinsic::x86_mmx_psrai_w:
26118   case Intrinsic::x86_mmx_psrai_d: {
26119     SDLoc DL(Op);
26120     SDValue ShAmt = Op.getOperand(2);
26121     // If the argument is a constant, convert it to a target constant.
26122     if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
      // Clamp out-of-bounds shift amounts, since they would otherwise be
      // masked to 8 bits, which may make them no longer out of bounds.
26125       unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26126       if (ShiftAmount == 0)
26127         return Op.getOperand(1);
26128 
26129       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26130                          Op.getOperand(0), Op.getOperand(1),
26131                          DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26132     }
26133 
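    // The shift amount is not a constant, so rewrite to the equivalent
    // non-immediate MMX shift intrinsic.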
26134     unsigned NewIntrinsic;
26135     switch (IntNo) {
26136     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
26137     case Intrinsic::x86_mmx_pslli_w:
26138       NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26139       break;
26140     case Intrinsic::x86_mmx_pslli_d:
26141       NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26142       break;
26143     case Intrinsic::x86_mmx_pslli_q:
26144       NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26145       break;
26146     case Intrinsic::x86_mmx_psrli_w:
26147       NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26148       break;
26149     case Intrinsic::x86_mmx_psrli_d:
26150       NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26151       break;
26152     case Intrinsic::x86_mmx_psrli_q:
26153       NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26154       break;
26155     case Intrinsic::x86_mmx_psrai_w:
26156       NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26157       break;
26158     case Intrinsic::x86_mmx_psrai_d:
26159       NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26160       break;
26161     }
26162 
    // The vector shift intrinsics with scalars use 32-bit shift amounts, but
    // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
    // MMX register.
26166     ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
26167     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26168                        DAG.getTargetConstant(NewIntrinsic, DL,
26169                                              getPointerTy(DAG.getDataLayout())),
26170                        Op.getOperand(1), ShAmt);
26171   }
26172   }
26173 }
26174 
26175 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26176                                  SDValue Src, SDValue Mask, SDValue Base,
26177                                  SDValue Index, SDValue ScaleOp, SDValue Chain,
26178                                  const X86Subtarget &Subtarget) {
26179   SDLoc dl(Op);
26180   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26181   // Scale must be constant.
26182   if (!C)
26183     return SDValue();
26184   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26185   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26186                                         TLI.getPointerTy(DAG.getDataLayout()));
26187   EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26188   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26189   // If source is undef or we know it won't be used, use a zero vector
26190   // to break register dependency.
26191   // TODO: use undef instead and let BreakFalseDeps deal with it?
26192   if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26193     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26194 
26195   // Cast mask to an integer type.
26196   Mask = DAG.getBitcast(MaskVT, Mask);
26197 
26198   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26199 
26200   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26201   SDValue Res =
26202       DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26203                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26204   return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26205 }
26206 
26207 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
26208                              SDValue Src, SDValue Mask, SDValue Base,
26209                              SDValue Index, SDValue ScaleOp, SDValue Chain,
26210                              const X86Subtarget &Subtarget) {
26211   MVT VT = Op.getSimpleValueType();
26212   SDLoc dl(Op);
26213   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26214   // Scale must be constant.
26215   if (!C)
26216     return SDValue();
26217   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26218   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26219                                         TLI.getPointerTy(DAG.getDataLayout()));
26220   unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26221                               VT.getVectorNumElements());
26222   MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26223 
26224   // We support two versions of the gather intrinsics. One with scalar mask and
26225   // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26226   if (Mask.getValueType() != MaskVT)
26227     Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26228 
26229   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26230   // If source is undef or we know it won't be used, use a zero vector
26231   // to break register dependency.
26232   // TODO: use undef instead and let BreakFalseDeps deal with it?
26233   if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26234     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26235 
26236   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26237 
26238   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26239   SDValue Res =
26240       DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26241                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26242   return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26243 }
26244 
26245 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26246                                SDValue Src, SDValue Mask, SDValue Base,
26247                                SDValue Index, SDValue ScaleOp, SDValue Chain,
26248                                const X86Subtarget &Subtarget) {
26249   SDLoc dl(Op);
26250   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26251   // Scale must be constant.
26252   if (!C)
26253     return SDValue();
26254   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26255   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26256                                         TLI.getPointerTy(DAG.getDataLayout()));
26257   unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26258                               Src.getSimpleValueType().getVectorNumElements());
26259   MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26260 
26261   // We support two versions of the scatter intrinsics. One with scalar mask and
26262   // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26263   if (Mask.getValueType() != MaskVT)
26264     Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26265 
26266   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26267 
26268   SDVTList VTs = DAG.getVTList(MVT::Other);
26269   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26270   SDValue Res =
26271       DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26272                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26273   return Res;
26274 }
26275 
26276 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26277                                SDValue Mask, SDValue Base, SDValue Index,
26278                                SDValue ScaleOp, SDValue Chain,
26279                                const X86Subtarget &Subtarget) {
26280   SDLoc dl(Op);
26281   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26282   // Scale must be constant.
26283   if (!C)
26284     return SDValue();
26285   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26286   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26287                                         TLI.getPointerTy(DAG.getDataLayout()));
26288   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26289   SDValue Segment = DAG.getRegister(0, MVT::i32);
26290   MVT MaskVT =
26291     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26292   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
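  // Operands are the mask followed by the standard X86 memory reference
  // (base, scale, index, displacement, segment) and the chain.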
26293   SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26294   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26295   return SDValue(Res, 0);
26296 }
26297 
26298 /// Handles the lowering of builtin intrinsics with chain that return their
26299 /// value into registers EDX:EAX.
/// If operand SrcReg is a valid register identifier, then operand 2 of N is
26301 /// copied to SrcReg. The assumption is that SrcReg is an implicit input to
26302 /// TargetOpcode.
/// Returns a Glue value which can be used to add an extra copy-from-reg if the
/// expanded intrinsic implicitly defines extra registers (i.e. not just
26305 /// EDX:EAX).
26306 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
26307                                         SelectionDAG &DAG,
26308                                         unsigned TargetOpcode,
26309                                         unsigned SrcReg,
26310                                         const X86Subtarget &Subtarget,
26311                                         SmallVectorImpl<SDValue> &Results) {
26312   SDValue Chain = N->getOperand(0);
26313   SDValue Glue;
26314 
26315   if (SrcReg) {
26316     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26317     Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26318     Glue = Chain.getValue(1);
26319   }
26320 
26321   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26322   SDValue N1Ops[] = {Chain, Glue};
26323   SDNode *N1 = DAG.getMachineNode(
26324       TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
26325   Chain = SDValue(N1, 0);
26326 
  // The expanded instruction returns its 64-bit result in registers EDX:EAX.
26328   SDValue LO, HI;
26329   if (Subtarget.is64Bit()) {
26330     LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26331     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26332                             LO.getValue(2));
26333   } else {
26334     LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26335     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26336                             LO.getValue(2));
26337   }
26338   Chain = HI.getValue(1);
26339   Glue = HI.getValue(2);
26340 
26341   if (Subtarget.is64Bit()) {
26342     // Merge the two 32-bit values into a 64-bit one.
26343     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26344                               DAG.getConstant(32, DL, MVT::i8));
26345     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26346     Results.push_back(Chain);
26347     return Glue;
26348   }
26349 
26350   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26351   SDValue Ops[] = { LO, HI };
26352   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26353   Results.push_back(Pair);
26354   Results.push_back(Chain);
26355   return Glue;
26356 }
26357 
26358 /// Handles the lowering of builtin intrinsics that read the time stamp counter
26359 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26360 /// READCYCLECOUNTER nodes.
26361 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26362                                     SelectionDAG &DAG,
26363                                     const X86Subtarget &Subtarget,
26364                                     SmallVectorImpl<SDValue> &Results) {
26365   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26366   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26367   // and the EAX register is loaded with the low-order 32 bits.
26368   SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26369                                              /* NoRegister */0, Subtarget,
26370                                              Results);
26371   if (Opcode != X86::RDTSCP)
26372     return;
26373 
26374   SDValue Chain = Results[1];
  // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
  // the ECX register. Add 'ecx' explicitly to the chain.
26377   SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26378   Results[1] = ecx;
26379   Results.push_back(ecx.getValue(1));
26380 }
26381 
26382 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
26383                                      SelectionDAG &DAG) {
26384   SmallVector<SDValue, 3> Results;
26385   SDLoc DL(Op);
26386   getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26387                           Results);
26388   return DAG.getMergeValues(Results, DL);
26389 }
26390 
26391 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
26392   MachineFunction &MF = DAG.getMachineFunction();
26393   SDValue Chain = Op.getOperand(0);
26394   SDValue RegNode = Op.getOperand(2);
26395   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26396   if (!EHInfo)
26397     report_fatal_error("EH registrations only live in functions using WinEH");
26398 
26399   // Cast the operand to an alloca, and remember the frame index.
26400   auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26401   if (!FINode)
26402     report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26403   EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26404 
26405   // Return the chain operand without making any DAG nodes.
26406   return Chain;
26407 }
26408 
26409 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
26410   MachineFunction &MF = DAG.getMachineFunction();
26411   SDValue Chain = Op.getOperand(0);
26412   SDValue EHGuard = Op.getOperand(2);
26413   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26414   if (!EHInfo)
    report_fatal_error("EHGuard only lives in functions using WinEH");
26416 
26417   // Cast the operand to an alloca, and remember the frame index.
26418   auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26419   if (!FINode)
26420     report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26421   EHInfo->EHGuardFrameIndex = FINode->getIndex();
26422 
26423   // Return the chain operand without making any DAG nodes.
26424   return Chain;
26425 }
26426 
26427 /// Emit Truncating Store with signed or unsigned saturation.
26428 static SDValue
26429 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
26430                 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26431                 SelectionDAG &DAG) {
26432   SDVTList VTs = DAG.getVTList(MVT::Other);
26433   SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26434   SDValue Ops[] = { Chain, Val, Ptr, Undef };
26435   unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26436   return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26437 }
26438 
26439 /// Emit Masked Truncating Store with signed or unsigned saturation.
26440 static SDValue
26441 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
26442                       SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26443                       MachineMemOperand *MMO, SelectionDAG &DAG) {
26444   SDVTList VTs = DAG.getVTList(MVT::Other);
26445   SDValue Ops[] = { Chain, Val, Ptr, Mask };
26446   unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26447   return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26448 }
26449 
26450 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
26451                                       SelectionDAG &DAG) {
26452   unsigned IntNo = Op.getConstantOperandVal(1);
26453   const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
26454   if (!IntrData) {
26455     switch (IntNo) {
26456     case llvm::Intrinsic::x86_seh_ehregnode:
26457       return MarkEHRegistrationNode(Op, DAG);
26458     case llvm::Intrinsic::x86_seh_ehguard:
26459       return MarkEHGuard(Op, DAG);
26460     case llvm::Intrinsic::x86_rdpkru: {
26461       SDLoc dl(Op);
26462       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26463       // Create a RDPKRU node and pass 0 to the ECX parameter.
26464       return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26465                          DAG.getConstant(0, dl, MVT::i32));
26466     }
26467     case llvm::Intrinsic::x86_wrpkru: {
26468       SDLoc dl(Op);
      // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
26470       // to the EDX and ECX parameters.
26471       return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26472                          Op.getOperand(0), Op.getOperand(2),
26473                          DAG.getConstant(0, dl, MVT::i32),
26474                          DAG.getConstant(0, dl, MVT::i32));
26475     }
26476     case llvm::Intrinsic::x86_flags_read_u32:
26477     case llvm::Intrinsic::x86_flags_read_u64:
26478     case llvm::Intrinsic::x86_flags_write_u32:
26479     case llvm::Intrinsic::x86_flags_write_u64: {
26480       // We need a frame pointer because this will get lowered to a PUSH/POP
26481       // sequence.
26482       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26483       MFI.setHasCopyImplyingStackAdjustment(true);
26484       // Don't do anything here, we will expand these intrinsics out later
26485       // during FinalizeISel in EmitInstrWithCustomInserter.
26486       return Op;
26487     }
26488     case Intrinsic::x86_lwpins32:
26489     case Intrinsic::x86_lwpins64:
26490     case Intrinsic::x86_umwait:
26491     case Intrinsic::x86_tpause: {
26492       SDLoc dl(Op);
26493       SDValue Chain = Op->getOperand(0);
26494       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26495       unsigned Opcode;
26496 
26497       switch (IntNo) {
26498       default: llvm_unreachable("Impossible intrinsic");
26499       case Intrinsic::x86_umwait:
26500         Opcode = X86ISD::UMWAIT;
26501         break;
26502       case Intrinsic::x86_tpause:
26503         Opcode = X86ISD::TPAUSE;
26504         break;
26505       case Intrinsic::x86_lwpins32:
26506       case Intrinsic::x86_lwpins64:
26507         Opcode = X86ISD::LWPINS;
26508         break;
26509       }
26510 
26511       SDValue Operation =
26512           DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26513                       Op->getOperand(3), Op->getOperand(4));
26514       SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26515       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26516                          Operation.getValue(1));
26517     }
26518     case Intrinsic::x86_enqcmd:
26519     case Intrinsic::x86_enqcmds: {
26520       SDLoc dl(Op);
26521       SDValue Chain = Op.getOperand(0);
26522       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26523       unsigned Opcode;
26524       switch (IntNo) {
26525       default: llvm_unreachable("Impossible intrinsic!");
26526       case Intrinsic::x86_enqcmd:
26527         Opcode = X86ISD::ENQCMD;
26528         break;
26529       case Intrinsic::x86_enqcmds:
26530         Opcode = X86ISD::ENQCMDS;
26531         break;
26532       }
26533       SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
26534                                       Op.getOperand(3));
26535       SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
26536       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26537                          Operation.getValue(1));
26538     }
26539     case Intrinsic::x86_aesenc128kl:
26540     case Intrinsic::x86_aesdec128kl:
26541     case Intrinsic::x86_aesenc256kl:
26542     case Intrinsic::x86_aesdec256kl: {
26543       SDLoc DL(Op);
26544       SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
26545       SDValue Chain = Op.getOperand(0);
26546       unsigned Opcode;
26547 
26548       switch (IntNo) {
26549       default: llvm_unreachable("Impossible intrinsic");
26550       case Intrinsic::x86_aesenc128kl:
26551         Opcode = X86ISD::AESENC128KL;
26552         break;
26553       case Intrinsic::x86_aesdec128kl:
26554         Opcode = X86ISD::AESDEC128KL;
26555         break;
26556       case Intrinsic::x86_aesenc256kl:
26557         Opcode = X86ISD::AESENC256KL;
26558         break;
26559       case Intrinsic::x86_aesdec256kl:
26560         Opcode = X86ISD::AESDEC256KL;
26561         break;
26562       }
26563 
26564       MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26565       MachineMemOperand *MMO = MemIntr->getMemOperand();
26566       EVT MemVT = MemIntr->getMemoryVT();
26567       SDValue Operation = DAG.getMemIntrinsicNode(
26568           Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
26569           MMO);
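      // These nodes also produce EFLAGS (value 1); surface ZF as the
      // intrinsic's i8 status result.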
26570       SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
26571 
26572       return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26573                          {ZF, Operation.getValue(0), Operation.getValue(2)});
26574     }
26575     case Intrinsic::x86_aesencwide128kl:
26576     case Intrinsic::x86_aesdecwide128kl:
26577     case Intrinsic::x86_aesencwide256kl:
26578     case Intrinsic::x86_aesdecwide256kl: {
26579       SDLoc DL(Op);
26580       SDVTList VTs = DAG.getVTList(
26581           {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
26582            MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
26583       SDValue Chain = Op.getOperand(0);
26584       unsigned Opcode;
26585 
26586       switch (IntNo) {
26587       default: llvm_unreachable("Impossible intrinsic");
26588       case Intrinsic::x86_aesencwide128kl:
26589         Opcode = X86ISD::AESENCWIDE128KL;
26590         break;
26591       case Intrinsic::x86_aesdecwide128kl:
26592         Opcode = X86ISD::AESDECWIDE128KL;
26593         break;
26594       case Intrinsic::x86_aesencwide256kl:
26595         Opcode = X86ISD::AESENCWIDE256KL;
26596         break;
26597       case Intrinsic::x86_aesdecwide256kl:
26598         Opcode = X86ISD::AESDECWIDE256KL;
26599         break;
26600       }
26601 
26602       MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26603       MachineMemOperand *MMO = MemIntr->getMemOperand();
26604       EVT MemVT = MemIntr->getMemoryVT();
26605       SDValue Operation = DAG.getMemIntrinsicNode(
26606           Opcode, DL, VTs,
26607           {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
26608            Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
26609            Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
26610           MemVT, MMO);
26611       SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
26612 
26613       return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26614                          {ZF, Operation.getValue(1), Operation.getValue(2),
26615                           Operation.getValue(3), Operation.getValue(4),
26616                           Operation.getValue(5), Operation.getValue(6),
26617                           Operation.getValue(7), Operation.getValue(8),
26618                           Operation.getValue(9)});
26619     }
26620     case Intrinsic::x86_testui: {
26621       SDLoc dl(Op);
26622       SDValue Chain = Op.getOperand(0);
26623       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26624       SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
26625       SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26626       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26627                          Operation.getValue(1));
26628     }
26629     }
26630     return SDValue();
26631   }
26632 
26633   SDLoc dl(Op);
26634   switch(IntrData->Type) {
26635   default: llvm_unreachable("Unknown Intrinsic Type");
26636   case RDSEED:
26637   case RDRAND: {
26638     // Emit the node with the right value type.
26639     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
26640     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26641 
26642     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from the RDRAND/RDSEED result, which is
    // always 0, cast to i32.
26644     SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
26645                      DAG.getConstant(1, dl, Op->getValueType(1)),
26646                      DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
26647                      SDValue(Result.getNode(), 1)};
26648     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
26649 
26650     // Return { result, isValid, chain }.
26651     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
26652                        SDValue(Result.getNode(), 2));
26653   }
26654   case GATHER_AVX2: {
26655     SDValue Chain = Op.getOperand(0);
26656     SDValue Src   = Op.getOperand(2);
26657     SDValue Base  = Op.getOperand(3);
26658     SDValue Index = Op.getOperand(4);
26659     SDValue Mask  = Op.getOperand(5);
26660     SDValue Scale = Op.getOperand(6);
26661     return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26662                              Scale, Chain, Subtarget);
26663   }
26664   case GATHER: {
    // gather(v1, base, index, mask, scale)
26666     SDValue Chain = Op.getOperand(0);
26667     SDValue Src   = Op.getOperand(2);
26668     SDValue Base  = Op.getOperand(3);
26669     SDValue Index = Op.getOperand(4);
26670     SDValue Mask  = Op.getOperand(5);
26671     SDValue Scale = Op.getOperand(6);
26672     return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
26673                          Chain, Subtarget);
26674   }
26675   case SCATTER: {
    // scatter(base, mask, index, v1, scale)
26677     SDValue Chain = Op.getOperand(0);
26678     SDValue Base  = Op.getOperand(2);
26679     SDValue Mask  = Op.getOperand(3);
26680     SDValue Index = Op.getOperand(4);
26681     SDValue Src   = Op.getOperand(5);
26682     SDValue Scale = Op.getOperand(6);
26683     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26684                           Scale, Chain, Subtarget);
26685   }
26686   case PREFETCH: {
26687     const APInt &HintVal = Op.getConstantOperandAPInt(6);
26688     assert((HintVal == 2 || HintVal == 3) &&
26689            "Wrong prefetch hint in intrinsic: should be 2 or 3");
26690     unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
26691     SDValue Chain = Op.getOperand(0);
26692     SDValue Mask  = Op.getOperand(2);
26693     SDValue Index = Op.getOperand(3);
26694     SDValue Base  = Op.getOperand(4);
26695     SDValue Scale = Op.getOperand(5);
26696     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
26697                            Subtarget);
26698   }
26699   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
26700   case RDTSC: {
26701     SmallVector<SDValue, 2> Results;
26702     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
26703                             Results);
26704     return DAG.getMergeValues(Results, dl);
26705   }
26706   // Read Performance Monitoring Counters.
26707   case RDPMC:
26708   // GetExtended Control Register.
26709   case XGETBV: {
26710     SmallVector<SDValue, 2> Results;
26711 
26712     // RDPMC uses ECX to select the index of the performance counter to read.
26713     // XGETBV uses ECX to select the index of the XCR register to return.
26714     // The result is stored into registers EDX:EAX.
26715     expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
26716                                 Subtarget, Results);
26717     return DAG.getMergeValues(Results, dl);
26718   }
26719   // XTEST intrinsics.
26720   case XTEST: {
26721     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
26722     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26723 
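    // XTEST sets ZF when no transaction is active, so COND_NE yields 1 inside
    // a transactional region.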
26724     SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
26725     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
26726     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
26727                        Ret, SDValue(InTrans.getNode(), 1));
26728   }
26729   case TRUNCATE_TO_MEM_VI8:
26730   case TRUNCATE_TO_MEM_VI16:
26731   case TRUNCATE_TO_MEM_VI32: {
26732     SDValue Mask = Op.getOperand(4);
26733     SDValue DataToTruncate = Op.getOperand(3);
26734     SDValue Addr = Op.getOperand(2);
26735     SDValue Chain = Op.getOperand(0);
26736 
26737     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
26738     assert(MemIntr && "Expected MemIntrinsicSDNode!");
26739 
26740     EVT MemVT  = MemIntr->getMemoryVT();
26741 
26742     uint16_t TruncationOp = IntrData->Opc0;
26743     switch (TruncationOp) {
26744     case X86ISD::VTRUNC: {
26745       if (isAllOnesConstant(Mask)) // return just a truncate store
26746         return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
26747                                  MemIntr->getMemOperand());
26748 
26749       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26750       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26751       SDValue Offset = DAG.getUNDEF(VMask.getValueType());
26752 
26753       return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
26754                                 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
26755                                 true /* truncating */);
26756     }
26757     case X86ISD::VTRUNCUS:
26758     case X86ISD::VTRUNCS: {
26759       bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
26760       if (isAllOnesConstant(Mask))
26761         return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
26762                                MemIntr->getMemOperand(), DAG);
26763 
26764       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26765       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26766 
26767       return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
26768                                    VMask, MemVT, MemIntr->getMemOperand(), DAG);
26769     }
26770     default:
26771       llvm_unreachable("Unsupported truncstore intrinsic");
26772     }
26773   }
26774   }
26775 }
26776 
26777 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
26778                                            SelectionDAG &DAG) const {
26779   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26780   MFI.setReturnAddressIsTaken(true);
26781 
26782   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
26783     return SDValue();
26784 
26785   unsigned Depth = Op.getConstantOperandVal(0);
26786   SDLoc dl(Op);
26787   EVT PtrVT = getPointerTy(DAG.getDataLayout());
26788 
26789   if (Depth > 0) {
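    // The return address of that frame sits one slot above its saved frame
    // pointer, so load it from FrameAddr + SlotSize.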
26790     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
26791     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26792     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
26793     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26794                        DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
26795                        MachinePointerInfo());
26796   }
26797 
26798   // Just load the return address.
26799   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
26800   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
26801                      MachinePointerInfo());
26802 }
26803 
26804 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
26805                                                  SelectionDAG &DAG) const {
26806   DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
26807   return getReturnAddressFrameIndex(DAG);
26808 }
26809 
26810 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
26811   MachineFunction &MF = DAG.getMachineFunction();
26812   MachineFrameInfo &MFI = MF.getFrameInfo();
26813   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
26814   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26815   EVT VT = Op.getValueType();
26816 
26817   MFI.setFrameAddressIsTaken(true);
26818 
26819   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
26820     // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
26821     // is not possible to crawl up the stack without looking at the unwind codes
26822     // simultaneously.
26823     int FrameAddrIndex = FuncInfo->getFAIndex();
26824     if (!FrameAddrIndex) {
26825       // Set up a frame object for the return address.
26826       unsigned SlotSize = RegInfo->getSlotSize();
26827       FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
26828           SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
26829       FuncInfo->setFAIndex(FrameAddrIndex);
26830     }
26831     return DAG.getFrameIndex(FrameAddrIndex, VT);
26832   }
26833 
26834   unsigned FrameReg =
26835       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
26836   SDLoc dl(Op);  // FIXME probably not meaningful
26837   unsigned Depth = Op.getConstantOperandVal(0);
26838   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
26839           (FrameReg == X86::EBP && VT == MVT::i32)) &&
26840          "Invalid Frame Register!");
26841   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
26842   while (Depth--)
26843     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
26844                             MachinePointerInfo());
26845   return FrameAddr;
26846 }
26847 
26848 // FIXME? Maybe this could be a TableGen attribute on some registers and
26849 // this table could be generated automatically from RegInfo.
26850 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
26851                                               const MachineFunction &MF) const {
26852   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
26853 
26854   Register Reg = StringSwitch<unsigned>(RegName)
26855                        .Case("esp", X86::ESP)
26856                        .Case("rsp", X86::RSP)
26857                        .Case("ebp", X86::EBP)
26858                        .Case("rbp", X86::RBP)
26859                        .Default(0);
26860 
26861   if (Reg == X86::EBP || Reg == X86::RBP) {
26862     if (!TFI.hasFP(MF))
26863       report_fatal_error("register " + StringRef(RegName) +
26864                          " is allocatable: function has no frame pointer");
26865 #ifndef NDEBUG
26866     else {
26867       const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26868       Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
26869       assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
26870              "Invalid Frame Register!");
26871     }
26872 #endif
26873   }
26874 
26875   if (Reg)
26876     return Reg;
26877 
26878   report_fatal_error("Invalid register name global variable");
26879 }
26880 
26881 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
26882                                                      SelectionDAG &DAG) const {
26883   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26884   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
26885 }
26886 
26887 Register X86TargetLowering::getExceptionPointerRegister(
26888     const Constant *PersonalityFn) const {
26889   if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
26890     return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
26891 
26892   return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
26893 }
26894 
26895 Register X86TargetLowering::getExceptionSelectorRegister(
26896     const Constant *PersonalityFn) const {
26897   // Funclet personalities don't use selectors (the runtime does the selection).
26898   if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
26899     return X86::NoRegister;
26900   return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
26901 }
26902 
26903 bool X86TargetLowering::needsFixedCatchObjects() const {
26904   return Subtarget.isTargetWin64();
26905 }
26906 
26907 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
26908   SDValue Chain     = Op.getOperand(0);
26909   SDValue Offset    = Op.getOperand(1);
26910   SDValue Handler   = Op.getOperand(2);
26911   SDLoc dl      (Op);
26912 
26913   EVT PtrVT = getPointerTy(DAG.getDataLayout());
26914   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26915   Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
26916   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
26917           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
26918          "Invalid Frame Register!");
26919   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
26920   Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
26921 
26922   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
26923                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
26924                                                        dl));
26925   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
26926   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
26927   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
26928 
26929   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
26930                      DAG.getRegister(StoreAddrReg, PtrVT));
26931 }
26932 
26933 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
26934                                                SelectionDAG &DAG) const {
26935   SDLoc DL(Op);
26936   // If the subtarget is not 64-bit, we may need the global base reg
26937   // after the pseudo is expanded post-isel, i.e., after the CGBR pass ran.
26938   // Therefore, ask for the GlobalBaseReg now, so that the pass
26939   // inserts the code for us in case we need it.
26940   // Otherwise, we would end up referencing a virtual register
26941   // that is never defined.
26942   if (!Subtarget.is64Bit()) {
26943     const X86InstrInfo *TII = Subtarget.getInstrInfo();
26944     (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
26945   }
26946   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
26947                      DAG.getVTList(MVT::i32, MVT::Other),
26948                      Op.getOperand(0), Op.getOperand(1));
26949 }
26950 
26951 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
26952                                                 SelectionDAG &DAG) const {
26953   SDLoc DL(Op);
26954   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
26955                      Op.getOperand(0), Op.getOperand(1));
26956 }
26957 
26958 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
26959                                                        SelectionDAG &DAG) const {
26960   SDLoc DL(Op);
26961   return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
26962                      Op.getOperand(0));
26963 }
26964 
26965 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
26966   return Op.getOperand(0);
26967 }
26968 
26969 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
26970                                                 SelectionDAG &DAG) const {
26971   SDValue Root = Op.getOperand(0);
26972   SDValue Trmp = Op.getOperand(1); // trampoline
26973   SDValue FPtr = Op.getOperand(2); // nested function
26974   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
26975   SDLoc dl (Op);
26976 
26977   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26978   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26979 
26980   if (Subtarget.is64Bit()) {
26981     SDValue OutChains[6];
26982 
26983     // Large code-model.
26984     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
26985     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
26986 
26987     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
26988     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
26989 
26990     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
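    // The stores below lay out the trampoline bytes as follows
    // (little-endian byte offsets):
    //   0:  49 BB <imm64 FPtr>   movabsq $FPtr, %r11
    //   10: 49 BA <imm64 Nest>   movabsq $Nest, %r10
    //   20: 49 FF E3             jmpq    *%r11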
26991 
26992     // Load the pointer to the nested function into R11.
26993     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
26994     SDValue Addr = Trmp;
26995     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
26996                                 Addr, MachinePointerInfo(TrmpAddr));
26997 
26998     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26999                        DAG.getConstant(2, dl, MVT::i64));
27000     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27001                                 MachinePointerInfo(TrmpAddr, 2), Align(2));
27002 
27003     // Load the 'nest' parameter value into R10.
27004     // R10 is specified in X86CallingConv.td
27005     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27006     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27007                        DAG.getConstant(10, dl, MVT::i64));
27008     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27009                                 Addr, MachinePointerInfo(TrmpAddr, 10));
27010 
27011     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27012                        DAG.getConstant(12, dl, MVT::i64));
27013     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27014                                 MachinePointerInfo(TrmpAddr, 12), Align(2));
27015 
27016     // Jump to the nested function.
27017     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27018     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27019                        DAG.getConstant(20, dl, MVT::i64));
27020     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27021                                 Addr, MachinePointerInfo(TrmpAddr, 20));
27022 
27023     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27024     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27025                        DAG.getConstant(22, dl, MVT::i64));
27026     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27027                                 Addr, MachinePointerInfo(TrmpAddr, 22));
27028 
27029     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27030   } else {
27031     const Function *Func =
27032       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
27033     CallingConv::ID CC = Func->getCallingConv();
27034     unsigned NestReg;
27035 
27036     switch (CC) {
27037     default:
27038       llvm_unreachable("Unsupported calling convention");
27039     case CallingConv::C:
27040     case CallingConv::X86_StdCall: {
27041       // Pass 'nest' parameter in ECX.
27042       // Must be kept in sync with X86CallingConv.td
27043       NestReg = X86::ECX;
27044 
27045       // Check that ECX wasn't needed by an 'inreg' parameter.
27046       FunctionType *FTy = Func->getFunctionType();
27047       const AttributeList &Attrs = Func->getAttributes();
27048 
27049       if (!Attrs.isEmpty() && !Func->isVarArg()) {
27050         unsigned InRegCount = 0;
27051         unsigned Idx = 1;
27052 
27053         for (FunctionType::param_iterator I = FTy->param_begin(),
27054              E = FTy->param_end(); I != E; ++I, ++Idx)
27055           if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
27056             const DataLayout &DL = DAG.getDataLayout();
27057             // FIXME: should only count parameters that are lowered to integers.
27058             InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
27059           }
27060 
27061         if (InRegCount > 2) {
27062           report_fatal_error("Nest register in use - reduce number of inreg"
27063                              " parameters!");
27064         }
27065       }
27066       break;
27067     }
27068     case CallingConv::X86_FastCall:
27069     case CallingConv::X86_ThisCall:
27070     case CallingConv::Fast:
27071     case CallingConv::Tail:
27072     case CallingConv::SwiftTail:
27073       // Pass 'nest' parameter in EAX.
27074       // Must be kept in sync with X86CallingConv.td
27075       NestReg = X86::EAX;
27076       break;
27077     }
27078 
27079     SDValue OutChains[4];
27080     SDValue Addr, Disp;
27081 
27082     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27083                        DAG.getConstant(10, dl, MVT::i32));
27084     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
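    // The stores below lay out the 10-byte trampoline as:
    //   0: B8+r <imm32 Nest>   movl $Nest, %ecx (or %eax)
    //   5: E9   <imm32 Disp>   jmp  FPtr   (Disp = FPtr - (Trmp + 10), i.e.
    //                                       relative to the end of the jmp)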
27085 
27086     // This is storing the opcode for MOV32ri.
27087     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
27088     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
27089     OutChains[0] =
27090         DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
27091                      Trmp, MachinePointerInfo(TrmpAddr));
27092 
27093     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27094                        DAG.getConstant(1, dl, MVT::i32));
27095     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
27096                                 MachinePointerInfo(TrmpAddr, 1), Align(1));
27097 
27098     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
27099     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27100                        DAG.getConstant(5, dl, MVT::i32));
27101     OutChains[2] =
27102         DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
27103                      MachinePointerInfo(TrmpAddr, 5), Align(1));
27104 
27105     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27106                        DAG.getConstant(6, dl, MVT::i32));
27107     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
27108                                 MachinePointerInfo(TrmpAddr, 6), Align(1));
27109 
27110     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27111   }
27112 }
27113 
27114 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
27115                                             SelectionDAG &DAG) const {
27116   /*
27117    The rounding mode is in bits 11:10 of the x87 FP control word (FPCW), and
27118    has the following settings:
27119      00 Round to nearest
27120      01 Round to -inf
27121      10 Round to +inf
27122      11 Round to 0
27123 
27124   FLT_ROUNDS, on the other hand, expects the following:
27125     -1 Undefined
27126      0 Round to 0
27127      1 Round to nearest
27128      2 Round to +inf
27129      3 Round to -inf
27130 
27131   To perform the conversion, we use a packed lookup table of the four 2-bit
27132   values that we can index by FPCW[11:10]:
27133     0x2d --> (0b00,10,11,01) --> (0,2,3,1) indexed by FPCW[11:10]
27134 
27135     (0x2d >> ((FPCW & 0xc00) >> 9)) & 3
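  For example, with rounding set to +inf (FPCW[11:10] = 10, i.e. 0x800):
    (0x2d >> ((0x800 & 0xc00) >> 9)) & 3 = (0x2d >> 4) & 3 = 2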
27136   */
27137 
27138   MachineFunction &MF = DAG.getMachineFunction();
27139   MVT VT = Op.getSimpleValueType();
27140   SDLoc DL(Op);
27141 
27142   // Save FP Control Word to stack slot
27143   int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
27144   SDValue StackSlot =
27145       DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
27146 
27147   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
27148 
27149   SDValue Chain = Op.getOperand(0);
27150   SDValue Ops[] = {Chain, StackSlot};
27151   Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
27152                                   DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
27153                                   Align(2), MachineMemOperand::MOStore);
27154 
27155   // Load FP Control Word from stack slot
27156   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
27157   Chain = CWD.getValue(1);
27158 
27159   // Mask and turn the control bits into a shift for the lookup table.
27160   SDValue Shift =
27161     DAG.getNode(ISD::SRL, DL, MVT::i16,
27162                 DAG.getNode(ISD::AND, DL, MVT::i16,
27163                             CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
27164                 DAG.getConstant(9, DL, MVT::i8));
27165   Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
27166 
27167   SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27168   SDValue RetVal =
27169     DAG.getNode(ISD::AND, DL, MVT::i32,
27170                 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27171                 DAG.getConstant(3, DL, MVT::i32));
27172 
27173   RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27174 
27175   return DAG.getMergeValues({RetVal, Chain}, DL);
27176 }
27177 
27178 SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27179                                              SelectionDAG &DAG) const {
27180   MachineFunction &MF = DAG.getMachineFunction();
27181   SDLoc DL(Op);
27182   SDValue Chain = Op.getNode()->getOperand(0);
27183 
27184   // The x87 FP control word can only be loaded from memory, so we need to
27185   // allocate stack space to save/load it.
27186   int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27187   SDValue StackSlot =
27188       DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27189   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27190   MachineMemOperand *MMO =
27191       MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27192 
27193   // Store FP control word into memory.
27194   SDValue Ops[] = {Chain, StackSlot};
27195   Chain = DAG.getMemIntrinsicNode(
27196       X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27197 
27198   // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27199   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27200   Chain = CWD.getValue(1);
27201   CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27202                     DAG.getConstant(0xf3ff, DL, MVT::i16));
27203 
27204   // Calculate new rounding mode.
27205   SDValue NewRM = Op.getNode()->getOperand(1);
27206   SDValue RMBits;
27207   if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27208     uint64_t RM = CVal->getZExtValue();
27209     int FieldVal;
27210     switch (static_cast<RoundingMode>(RM)) {
27211     case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27212     case RoundingMode::TowardNegative:    FieldVal = X86::rmDownward; break;
27213     case RoundingMode::TowardPositive:    FieldVal = X86::rmUpward; break;
27214     case RoundingMode::TowardZero:        FieldVal = X86::rmTowardZero; break;
27215     default:
27216       llvm_unreachable("rounding mode is not supported by X86 hardware");
27217     }
27218     RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
27219   } else {
27220     // Need to convert argument into bits of control word:
27221     //    0 Round to 0       -> 11
27222     //    1 Round to nearest -> 00
27223     //    2 Round to +inf    -> 10
27224     //    3 Round to -inf    -> 01
27225     // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
27226     // To perform the conversion, pack all these values into the constant 0xc9
27227     // and shift it left depending on the rounding mode:
27228     //    (0xc9 << 4) & 0xc00 = X86::rmTowardZero
27229     //    (0xc9 << 6) & 0xc00 = X86::rmToNearest
27230     //    ...
27231     // (0xc9 << (2 * NewRM + 4)) & 0xc00
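    // For example, NewRM = 2 (round to +inf) gives
    //   (0xc9 << 8) & 0xc00 = 0xc900 & 0xc00 = 0x800, i.e. bits 11:10 = 10b.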
27232     SDValue ShiftValue =
27233         DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27234                     DAG.getNode(ISD::ADD, DL, MVT::i32,
27235                                 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
27236                                             DAG.getConstant(1, DL, MVT::i8)),
27237                                 DAG.getConstant(4, DL, MVT::i32)));
27238     SDValue Shifted =
27239         DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
27240                     ShiftValue);
27241     RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
27242                          DAG.getConstant(0xc00, DL, MVT::i16));
27243   }
27244 
27245   // Update rounding mode bits and store the new FP Control Word into stack.
27246   CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
27247   Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 2);
27248 
27249   // Load FP control word from the slot.
27250   SDValue OpsLD[] = {Chain, StackSlot};
27251   MachineMemOperand *MMOL =
27252       MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
27253   Chain = DAG.getMemIntrinsicNode(
27254       X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
27255 
27256   // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
27257   // same way but in bits 14:13.
27258   if (Subtarget.hasSSE1()) {
27259     // Store MXCSR into memory.
27260     Chain = DAG.getNode(
27261         ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27262         DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27263         StackSlot);
27264 
27265     // Load MXCSR from stack slot and clear RM field (bits 14:13).
27266     SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
27267     Chain = CWD.getValue(1);
27268     CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
27269                       DAG.getConstant(0xffff9fff, DL, MVT::i32));
27270 
27271     // Shift X87 RM bits from 11:10 to 14:13.
27272     RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
27273     RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
27274                          DAG.getConstant(3, DL, MVT::i8));
27275 
27276     // Update rounding mode bits and store the new FP Control Word into stack.
27277     CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
27278     Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 4);
27279 
27280     // Load MXCSR from the slot.
27281     Chain = DAG.getNode(
27282         ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27283         DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27284         StackSlot);
27285   }
27286 
27287   return Chain;
27288 }
27289 
27290 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
27291 //
27292 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
27293 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
27294 // split the vector, perform the operation on its Lo and Hi parts and
27295 // concatenate the results.
27296 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
27297                                          const X86Subtarget &Subtarget) {
27298   assert(Op.getOpcode() == ISD::CTLZ);
27299   SDLoc dl(Op);
27300   MVT VT = Op.getSimpleValueType();
27301   MVT EltVT = VT.getVectorElementType();
27302   unsigned NumElems = VT.getVectorNumElements();
27303 
27304   assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
27305           "Unsupported element type");
27306 
27307   // Split the vector; its Lo and Hi parts will be handled in the next iteration.
27308   if (NumElems > 16 ||
27309       (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
27310     return splitVectorIntUnary(Op, DAG);
27311 
27312   MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
27313   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
27314           "Unsupported value type for operation");
27315 
27316   // Use the natively supported vector instruction vplzcntd.
27317   Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
27318   SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
27319   SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
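  // The zero-extended value has 32 - EltSizeInBits extra leading zeros, so
  // subtract them off; e.g. for i16 elements, lzcnt of the i32 value counts
  // 16 extra zero bits.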
27320   SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
27321 
27322   return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
27323 }
27324 
27325 // Lower CTLZ using a PSHUFB lookup table implementation.
27326 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
27327                                        const X86Subtarget &Subtarget,
27328                                        SelectionDAG &DAG) {
27329   MVT VT = Op.getSimpleValueType();
27330   int NumElts = VT.getVectorNumElements();
27331   int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
27332   MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
27333 
27334   // Per-nibble leading zero PSHUFB lookup table.
27335   const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
27336                        /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
27337                        /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
27338                        /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
27339 
27340   SmallVector<SDValue, 64> LUTVec;
27341   for (int i = 0; i < NumBytes; ++i)
27342     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
27343   SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
27344 
27345   // Begin by bitcasting the input to a byte vector, then split those bytes
27346   // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
27347   // If the hi input nibble is zero then we add both results together, otherwise
27348   // we just take the hi result (by masking the lo result to zero before the
27349   // add).
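  // For example, for the byte 0x1C the hi nibble is 0x1, so LUT[1] = 3 is taken
  // directly (the lo-nibble count is masked to zero), matching ctlz(0x1C) = 3
  // for an 8-bit value.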
27350   SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
27351   SDValue Zero = DAG.getConstant(0, DL, CurrVT);
27352 
27353   SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
27354   SDValue Lo = Op0;
27355   SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
27356   SDValue HiZ;
27357   if (CurrVT.is512BitVector()) {
27358     MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27359     HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
27360     HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27361   } else {
27362     HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
27363   }
27364 
27365   Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
27366   Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
27367   Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
27368   SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
27369 
27370   // Merge result back from vXi8 back to VT, working on the lo/hi halves
27371   // of the current vector width in the same way we did for the nibbles.
27372   // If the upper half of the input element is zero then add the halves'
27373   // leading zero counts together, otherwise just use the upper half's.
27374   // Double the width of the result until we are at target width.
27375   while (CurrVT != VT) {
27376     int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
27377     int CurrNumElts = CurrVT.getVectorNumElements();
27378     MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
27379     MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
27380     SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
27381 
27382     // Check if the upper half of the input element is zero.
27383     if (CurrVT.is512BitVector()) {
27384       MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27385       HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
27386                          DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27387       HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27388     } else {
27389       HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
27390                          DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27391     }
27392     HiZ = DAG.getBitcast(NextVT, HiZ);
27393 
27394     // Move the upper/lower halves to the lower bits as we'll be extending to
27395     // NextVT. Mask the lower result to zero if HiZ is true and add the results
27396     // together.
27397     SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
27398     SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
27399     SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
27400     R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
27401     Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
27402     CurrVT = NextVT;
27403   }
27404 
27405   return Res;
27406 }
27407 
27408 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
27409                                const X86Subtarget &Subtarget,
27410                                SelectionDAG &DAG) {
27411   MVT VT = Op.getSimpleValueType();
27412 
27413   if (Subtarget.hasCDI() &&
27414       // vXi8 vectors need to be promoted to 512 bits for vXi32.
27415       (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
27416     return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
27417 
27418   // Decompose 256-bit ops into smaller 128-bit ops.
27419   if (VT.is256BitVector() && !Subtarget.hasInt256())
27420     return splitVectorIntUnary(Op, DAG);
27421 
27422   // Decompose 512-bit ops into smaller 256-bit ops.
27423   if (VT.is512BitVector() && !Subtarget.hasBWI())
27424     return splitVectorIntUnary(Op, DAG);
27425 
27426   assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
27427   return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
27428 }
27429 
27430 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
27431                          SelectionDAG &DAG) {
27432   MVT VT = Op.getSimpleValueType();
27433   MVT OpVT = VT;
27434   unsigned NumBits = VT.getSizeInBits();
27435   SDLoc dl(Op);
27436   unsigned Opc = Op.getOpcode();
27437 
27438   if (VT.isVector())
27439     return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
27440 
27441   Op = Op.getOperand(0);
27442   if (VT == MVT::i8) {
27443     // Zero extend to i32 since there is no i8 bsr.
27444     OpVT = MVT::i32;
27445     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
27446   }
27447 
27448   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
27449   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
27450   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
27451 
27452   if (Opc == ISD::CTLZ) {
27453     // If src is zero (i.e. bsr sets ZF), returns NumBits.
27454     SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
27455                      DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27456                      Op.getValue(1)};
27457     Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
27458   }
27459 
27460   // Finally xor with NumBits-1; bsr returns the index of the highest set bit.
27461   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
27462                    DAG.getConstant(NumBits - 1, dl, OpVT));
27463 
27464   if (VT == MVT::i8)
27465     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
27466   return Op;
27467 }
27468 
27469 static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
27470                          SelectionDAG &DAG) {
27471   MVT VT = Op.getSimpleValueType();
27472   unsigned NumBits = VT.getScalarSizeInBits();
27473   SDValue N0 = Op.getOperand(0);
27474   SDLoc dl(Op);
27475 
27476   assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
27477          "Only scalar CTTZ requires custom lowering");
27478 
27479   // Issue a bsf (scan bits forward) which also sets EFLAGS.
27480   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
27481   Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
27482 
27483   // If src is zero (i.e. bsf sets ZF), returns NumBits.
27484   SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
27485                    DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27486                    Op.getValue(1)};
27487   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
27488 }
27489 
27490 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
27491                            const X86Subtarget &Subtarget) {
27492   MVT VT = Op.getSimpleValueType();
27493   if (VT == MVT::i16 || VT == MVT::i32)
27494     return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
27495 
27496   if (VT == MVT::v32i16 || VT == MVT::v64i8)
27497     return splitVectorIntBinary(Op, DAG);
27498 
27499   assert(Op.getSimpleValueType().is256BitVector() &&
27500          Op.getSimpleValueType().isInteger() &&
27501          "Only handle AVX 256-bit vector integer operation");
27502   return splitVectorIntBinary(Op, DAG);
27503 }
27504 
27505 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
27506                                   const X86Subtarget &Subtarget) {
27507   MVT VT = Op.getSimpleValueType();
27508   SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
27509   unsigned Opcode = Op.getOpcode();
27510   SDLoc DL(Op);
27511 
27512   if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
27513       (VT.is256BitVector() && !Subtarget.hasInt256())) {
27514     assert(Op.getSimpleValueType().isInteger() &&
27515            "Only handle AVX vector integer operation");
27516     return splitVectorIntBinary(Op, DAG);
27517   }
27518 
27519   // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
27520   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27521   EVT SetCCResultType =
27522       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
27523 
27524   if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
27525     // usubsat X, Y --> (X >u Y) ? X - Y : 0
27526     SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
27527     SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
27528     // TODO: Move this to DAGCombiner?
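    // If the setcc result is an all-ones/all-zeros mask of the same type as VT
    // (every bit is a copy of the sign bit), the select collapses to an AND.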
27529     if (SetCCResultType == VT &&
27530         DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
27531       return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
27532     return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
27533   }
27534 
27535   // Use default expansion.
27536   return SDValue();
27537 }
27538 
27539 static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
27540                         SelectionDAG &DAG) {
27541   MVT VT = Op.getSimpleValueType();
27542   if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
27543     // Since X86 does not have CMOV for 8-bit integer, we don't convert
27544     // 8-bit integer abs to NEG and CMOV.
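    // abs(x) is computed as 0 - x followed by a CMOV that picks between x and
    // -x based on the flags produced by the subtraction.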
27545     SDLoc DL(Op);
27546     SDValue N0 = Op.getOperand(0);
27547     SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
27548                               DAG.getConstant(0, DL, VT), N0);
27549     SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
27550                      SDValue(Neg.getNode(), 1)};
27551     return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
27552   }
27553 
27554   // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
27555   if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
27556     SDLoc DL(Op);
27557     SDValue Src = Op.getOperand(0);
27558     SDValue Sub =
27559         DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
27560     return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
27561   }
27562 
27563   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
27564     assert(VT.isInteger() &&
27565            "Only handle AVX 256-bit vector integer operation");
27566     return splitVectorIntUnary(Op, DAG);
27567   }
27568 
27569   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27570     return splitVectorIntUnary(Op, DAG);
27571 
27572   // Default to expand.
27573   return SDValue();
27574 }
27575 
27576 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
27577   MVT VT = Op.getSimpleValueType();
27578 
27579   // For AVX1 cases, split to use legal ops (everything but v4i64).
27580   if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
27581     return splitVectorIntBinary(Op, DAG);
27582 
27583   if (VT == MVT::v32i16 || VT == MVT::v64i8)
27584     return splitVectorIntBinary(Op, DAG);
27585 
27586   // Default to expand.
27587   return SDValue();
27588 }
27589 
27590 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
27591                         SelectionDAG &DAG) {
27592   SDLoc dl(Op);
27593   MVT VT = Op.getSimpleValueType();
27594 
27595   // Decompose 256-bit ops into 128-bit ops.
27596   if (VT.is256BitVector() && !Subtarget.hasInt256())
27597     return splitVectorIntBinary(Op, DAG);
27598 
27599   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27600     return splitVectorIntBinary(Op, DAG);
27601 
27602   SDValue A = Op.getOperand(0);
27603   SDValue B = Op.getOperand(1);
27604 
27605   // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
27606   // vector pairs, multiply and truncate.
27607   if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
27608     unsigned NumElts = VT.getVectorNumElements();
27609 
27610     if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27611         (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27612       MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
27613       return DAG.getNode(
27614           ISD::TRUNCATE, dl, VT,
27615           DAG.getNode(ISD::MUL, dl, ExVT,
27616                       DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
27617                       DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
27618     }
27619 
27620     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27621 
27622     // Extract the lo/hi parts and any-extend them to i16.
27623     // We're going to mask the pmullw results down to the low byte of each
27624     // 16-bit element, so it doesn't matter what's in the high byte of each
27625     // extended element.
27626     SDValue Undef = DAG.getUNDEF(VT);
27627     SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
27628     SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
27629 
27630     SDValue BLo, BHi;
27631     if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
27632       // If the RHS is a constant, manually unpackl/unpackh.
27633       SmallVector<SDValue, 16> LoOps, HiOps;
27634       for (unsigned i = 0; i != NumElts; i += 16) {
27635         for (unsigned j = 0; j != 8; ++j) {
27636           LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
27637                                                MVT::i16));
27638           HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
27639                                                MVT::i16));
27640         }
27641       }
27642 
27643       BLo = DAG.getBuildVector(ExVT, dl, LoOps);
27644       BHi = DAG.getBuildVector(ExVT, dl, HiOps);
27645     } else {
27646       BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
27647       BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
27648     }
27649 
27650     // Multiply, mask the lower 8 bits of the lo/hi results and pack.
27651     SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
27652     SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
27653     RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
27654     RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
27655     return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
27656   }
27657 
27658   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
27659   if (VT == MVT::v4i32) {
27660     assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
27661            "Should not custom lower when pmulld is available!");
27662 
27663     // Extract the odd parts.
27664     static const int UnpackMask[] = { 1, -1, 3, -1 };
27665     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
27666     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
27667 
27668     // Multiply the even parts.
27669     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
27670                                 DAG.getBitcast(MVT::v2i64, A),
27671                                 DAG.getBitcast(MVT::v2i64, B));
27672     // Now multiply odd parts.
27673     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
27674                                DAG.getBitcast(MVT::v2i64, Aodds),
27675                                DAG.getBitcast(MVT::v2i64, Bodds));
27676 
27677     Evens = DAG.getBitcast(VT, Evens);
27678     Odds = DAG.getBitcast(VT, Odds);
27679 
27680     // Merge the two vectors back together with a shuffle. This expands into 2
27681     // shuffles.
27682     static const int ShufMask[] = { 0, 4, 2, 6 };
27683     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
27684   }
27685 
27686   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
27687          "Only know how to lower V2I64/V4I64/V8I64 multiply");
27688   assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
27689 
27690   //  Ahi = psrlqi(a, 32);
27691   //  Bhi = psrlqi(b, 32);
27692   //
27693   //  AloBlo = pmuludq(a, b);
27694   //  AloBhi = pmuludq(a, Bhi);
27695   //  AhiBlo = pmuludq(Ahi, b);
27696   //
27697   //  Hi = psllqi(AloBhi + AhiBlo, 32);
27698   //  return AloBlo + Hi;
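  //
  //  This works because a * b = (Alo + 2^32 * Ahi) * (Blo + 2^32 * Bhi)
  //                           = AloBlo + 2^32 * (AloBhi + AhiBlo)  (mod 2^64);
  //  the Ahi * Bhi term is scaled by 2^64 and therefore vanishes.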
27699   KnownBits AKnown = DAG.computeKnownBits(A);
27700   KnownBits BKnown = DAG.computeKnownBits(B);
27701 
27702   APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
27703   bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
27704   bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
27705 
27706   APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
27707   bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
27708   bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
27709 
27710   SDValue Zero = DAG.getConstant(0, dl, VT);
27711 
27712   // Only multiply lo/hi halves that aren't known to be zero.
27713   SDValue AloBlo = Zero;
27714   if (!ALoIsZero && !BLoIsZero)
27715     AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
27716 
27717   SDValue AloBhi = Zero;
27718   if (!ALoIsZero && !BHiIsZero) {
27719     SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
27720     AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
27721   }
27722 
27723   SDValue AhiBlo = Zero;
27724   if (!AHiIsZero && !BLoIsZero) {
27725     SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
27726     AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
27727   }
27728 
27729   SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
27730   Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
27731 
27732   return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
27733 }
27734 
27735 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
27736                                      MVT VT, bool IsSigned,
27737                                      const X86Subtarget &Subtarget,
27738                                      SelectionDAG &DAG,
27739                                      SDValue *Low = nullptr) {
27740   unsigned NumElts = VT.getVectorNumElements();
27741 
27742   // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
27743   // to a vXi16 type. Do the multiplies, shift the results and pack the half
27744   // lane results back together.
27745 
27746   // We'll take different approaches for signed and unsigned.
27747   // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
27748   // words and use pmullw to calculate the full 16-bit product.
27749   // For signed we'll use punpcklbw/punpckhbw to place each byte in the upper
27750   // half of its word (i.e. shifted left by 8). This allows us to use
27751   // pmulhw to calculate the full 16-bit product. This trick means we don't
27752   // need to sign extend the bytes to use pmullw.
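  // e.g. pmulhw(a << 8, b << 8) = ((a * 2^8) * (b * 2^8)) >> 16 = a * b, and
  // the signed 8-bit product always fits in 16 bits, so no bits are lost.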
27753 
27754   MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27755   SDValue Zero = DAG.getConstant(0, dl, VT);
27756 
27757   SDValue ALo, AHi;
27758   if (IsSigned) {
27759     ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
27760     AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
27761   } else {
27762     ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
27763     AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
27764   }
27765 
27766   SDValue BLo, BHi;
27767   if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
27768     // If the RHS is a constant, manually unpackl/unpackh and extend.
27769     SmallVector<SDValue, 16> LoOps, HiOps;
27770     for (unsigned i = 0; i != NumElts; i += 16) {
27771       for (unsigned j = 0; j != 8; ++j) {
27772         SDValue LoOp = B.getOperand(i + j);
27773         SDValue HiOp = B.getOperand(i + j + 8);
27774 
27775         if (IsSigned) {
27776           LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
27777           HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
27778           LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
27779                              DAG.getConstant(8, dl, MVT::i16));
27780           HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
27781                              DAG.getConstant(8, dl, MVT::i16));
27782         } else {
27783           LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
27784           HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
27785         }
27786 
27787         LoOps.push_back(LoOp);
27788         HiOps.push_back(HiOp);
27789       }
27790     }
27791 
27792     BLo = DAG.getBuildVector(ExVT, dl, LoOps);
27793     BHi = DAG.getBuildVector(ExVT, dl, HiOps);
27794   } else if (IsSigned) {
27795     BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
27796     BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
27797   } else {
27798     BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
27799     BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
27800   }
27801 
27802   // Multiply, then shift the upper 8 bits of each 16-bit result down to the
27803   // lower 8 bits and pack the lo/hi results back to vXi8.
27804   unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
27805   SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
27806   SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
27807 
27808   if (Low) {
27809     // Mask the lower bits and pack the results to rejoin the halves.
27810     SDValue Mask = DAG.getConstant(255, dl, ExVT);
27811     SDValue LLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, Mask);
27812     SDValue LHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, Mask);
27813     *Low = DAG.getNode(X86ISD::PACKUS, dl, VT, LLo, LHi);
27814   }
27815 
27816   RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
27817   RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
27818 
27819   // Bitcast back to VT and then pack all the even elements from Lo and Hi.
27820   return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
27821 }
27822 
27823 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
27824                          SelectionDAG &DAG) {
27825   SDLoc dl(Op);
27826   MVT VT = Op.getSimpleValueType();
27827   bool IsSigned = Op->getOpcode() == ISD::MULHS;
27828   unsigned NumElts = VT.getVectorNumElements();
27829   SDValue A = Op.getOperand(0);
27830   SDValue B = Op.getOperand(1);
27831 
27832   // Decompose 256-bit ops into 128-bit ops.
27833   if (VT.is256BitVector() && !Subtarget.hasInt256())
27834     return splitVectorIntBinary(Op, DAG);
27835 
27836   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27837     return splitVectorIntBinary(Op, DAG);
27838 
27839   if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
27840     assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
27841            (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
27842            (VT == MVT::v16i32 && Subtarget.hasAVX512()));
27843 
27844     // PMULxD operations multiply each even element (starting at 0) of the LHS
27845     // with the corresponding element of the RHS and produce a widened result.
27846     // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
27847     // => <2 x i64> <ae|cg>
27848     //
27849     // In other words, to have all the results, we need to perform two PMULxD:
27850     // 1. one with the even values.
27851     // 2. one with the odd values.
27852     // To achieve #2, we need to place the odd values at an even position.
27853     //
27854     // Place the odd value at an even position (basically, shift all values 1
27855     // step to the left):
27856     const int Mask[] = {1, -1,  3, -1,  5, -1,  7, -1,
27857                         9, -1, 11, -1, 13, -1, 15, -1};
27858     // <a|b|c|d> => <b|undef|d|undef>
27859     SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
27860                                         makeArrayRef(&Mask[0], NumElts));
27861     // <e|f|g|h> => <f|undef|h|undef>
27862     SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
27863                                         makeArrayRef(&Mask[0], NumElts));
27864 
27865     // Emit two multiplies, one for the lower 2 ints and one for the higher 2
27866     // ints.
27867     MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
27868     unsigned Opcode =
27869         (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
27870     // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
27871     // => <2 x i64> <ae|cg>
27872     SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
27873                                                   DAG.getBitcast(MulVT, A),
27874                                                   DAG.getBitcast(MulVT, B)));
27875     // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
27876     // => <2 x i64> <bf|dh>
27877     SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
27878                                                   DAG.getBitcast(MulVT, Odd0),
27879                                                   DAG.getBitcast(MulVT, Odd1)));
27880 
27881     // Shuffle it back into the right order.
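    // E.g. for v4i32 the mask is {1, 5, 3, 7}: viewed as v4i32, Mul1 is
    // <ae_lo|ae_hi|cg_lo|cg_hi> and Mul2 is <bf_lo|bf_hi|dh_lo|dh_hi>, so this
    // selects the high halves <ae_hi|bf_hi|cg_hi|dh_hi>.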
27882     SmallVector<int, 16> ShufMask(NumElts);
27883     for (int i = 0; i != (int)NumElts; ++i)
27884       ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
27885 
27886     SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
27887 
27888     // If we have a signed multiply but no PMULDQ, fix up the result of an
27889     // unsigned multiply.
27890     if (IsSigned && !Subtarget.hasSSE41()) {
27891       SDValue Zero = DAG.getConstant(0, dl, VT);
27892       SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
27893                                DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
27894       SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
27895                                DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
27896 
27897       SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
27898       Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
27899     }
27900 
27901     return Res;
27902   }
27903 
27904   // Only i8 vectors should need custom lowering after this.
27905   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
27906          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
27907          "Unsupported vector type");
27908 
27909   // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
27910   // logical shift down the upper half and pack back to i8.
27911 
27912   // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
27913   // and then ashr/lshr the upper bits down to the lower bits before multiply.
27914 
27915   if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27916       (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27917     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
27918     unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27919     SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
27920     SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
27921     SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
27922     Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
27923     return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
27924   }
27925 
27926   return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
27927 }
27928 
27929 // Custom lowering for SMULO/UMULO.
27930 static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
27931                          SelectionDAG &DAG) {
27932   MVT VT = Op.getSimpleValueType();
27933 
27934   // Scalars defer to LowerXALUO.
27935   if (!VT.isVector())
27936     return LowerXALUO(Op, DAG);
27937 
27938   SDLoc dl(Op);
27939   bool IsSigned = Op->getOpcode() == ISD::SMULO;
27940   SDValue A = Op.getOperand(0);
27941   SDValue B = Op.getOperand(1);
27942   EVT OvfVT = Op->getValueType(1);
27943 
27944   if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
27945       (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
27946     // Extract the LHS Lo/Hi vectors
27947     SDValue LHSLo, LHSHi;
27948     std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
27949 
27950     // Extract the RHS Lo/Hi vectors
27951     SDValue RHSLo, RHSHi;
27952     std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
27953 
27954     EVT LoOvfVT, HiOvfVT;
27955     std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
27956     SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
27957     SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
27958 
27959     // Issue the split operations.
27960     SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
27961     SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
27962 
27963     // Join the separate data results and the overflow results.
27964     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
27965     SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
27966                               Hi.getValue(1));
27967 
27968     return DAG.getMergeValues({Res, Ovf}, dl);
27969   }
27970 
27971   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27972   EVT SetccVT =
27973       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
27974 
27975   if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27976       (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27977     unsigned NumElts = VT.getVectorNumElements();
27978     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
27979     unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27980     SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
27981     SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
27982     SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
27983 
27984     SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
27985 
27986     SDValue Ovf;
27987     if (IsSigned) {
27988       SDValue High, LowSign;
27989       if (OvfVT.getVectorElementType() == MVT::i1 &&
27990           (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
27991         // Rather than truncating, try to do the compare on vXi16 or vXi32.
27992         // Shift the high down filling with sign bits.
27993         High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
27994         // Fill all 16 bits with the sign bit from the low.
27995         LowSign =
27996             getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
27997         LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
27998                                              15, DAG);
27999         SetccVT = OvfVT;
28000         if (!Subtarget.hasBWI()) {
28001           // We can't do a vXi16 compare so sign extend to v16i32.
28002           High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
28003           LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
28004         }
28005       } else {
28006         // Otherwise do the compare at vXi8.
28007         High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28008         High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28009         LowSign =
28010             DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28011       }
28012 
28013       Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28014     } else {
28015       SDValue High =
28016           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28017       if (OvfVT.getVectorElementType() == MVT::i1 &&
28018           (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28019         // Rather than truncating, try to do the compare on vXi16 or vXi32.
28020         SetccVT = OvfVT;
28021         if (!Subtarget.hasBWI()) {
28022           // We can't do a vXi16 compare so sign extend to v16i32.
28023           High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
28024         }
28025       } else {
28026         // Otherwise do the compare at vXi8.
28027         High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28028       }
28029 
28030       Ovf =
28031           DAG.getSetCC(dl, SetccVT, High,
28032                        DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
28033     }
28034 
28035     Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28036 
28037     return DAG.getMergeValues({Low, Ovf}, dl);
28038   }
28039 
28040   SDValue Low;
28041   SDValue High =
28042       LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
28043 
28044   SDValue Ovf;
28045   if (IsSigned) {
28046     // SMULO overflows if the high bits don't match the sign of the low.
28047     SDValue LowSign =
28048         DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28049     Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28050   } else {
28051     // UMULO overflows if the high bits are non-zero.
28052     Ovf =
28053         DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
28054   }
28055 
28056   Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28057 
28058   return DAG.getMergeValues({Low, Ovf}, dl);
28059 }
28060 
28061 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
28062   assert(Subtarget.isTargetWin64() && "Unexpected target");
28063   EVT VT = Op.getValueType();
28064   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28065          "Unexpected return type for lowering");
28066 
28067   RTLIB::Libcall LC;
28068   bool isSigned;
28069   switch (Op->getOpcode()) {
28070   default: llvm_unreachable("Unexpected request for libcall!");
28071   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
28072   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
28073   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
28074   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
28075   }
28076 
28077   SDLoc dl(Op);
28078   SDValue InChain = DAG.getEntryNode();
28079 
28080   TargetLowering::ArgListTy Args;
28081   TargetLowering::ArgListEntry Entry;
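  // Pass each i128 argument indirectly: store it to a 16-byte aligned stack
  // slot and hand the libcall a pointer to that slot.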
28082   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
28083     EVT ArgVT = Op->getOperand(i).getValueType();
28084     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28085            "Unexpected argument type for lowering");
28086     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28087     int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28088     MachinePointerInfo MPI =
28089         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28090     Entry.Node = StackPtr;
28091     InChain =
28092         DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
28093     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
    Entry.Ty = PointerType::get(ArgTy, 0);
28095     Entry.IsSExt = false;
28096     Entry.IsZExt = false;
28097     Args.push_back(Entry);
28098   }
28099 
28100   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
28101                                          getPointerTy(DAG.getDataLayout()));
28102 
28103   TargetLowering::CallLoweringInfo CLI(DAG);
28104   CLI.setDebugLoc(dl)
28105       .setChain(InChain)
28106       .setLibCallee(
28107           getLibcallCallingConv(LC),
28108           static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
28109           std::move(Args))
28110       .setInRegister()
28111       .setSExtResult(isSigned)
28112       .setZExtResult(!isSigned);
28113 
28114   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
28115   return DAG.getBitcast(VT, CallInfo.first);
28116 }
28117 
// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget.
28120 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
28121                                         unsigned Opcode) {
28122   if (VT.getScalarSizeInBits() < 16)
28123     return false;
28124 
28125   if (VT.is512BitVector() && Subtarget.hasAVX512() &&
28126       (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
28127     return true;
28128 
28129   bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
28130                 (VT.is256BitVector() && Subtarget.hasInt256());
28131 
28132   bool AShift = LShift && (Subtarget.hasAVX512() ||
28133                            (VT != MVT::v2i64 && VT != MVT::v4i64));
28134   return (Opcode == ISD::SRA) ? AShift : LShift;
28135 }
28136 
28137 // The shift amount is a variable, but it is the same for all vector lanes.
28138 // These instructions are defined together with shift-immediate.
static bool SupportedVectorShiftWithBaseAmnt(MVT VT,
                                             const X86Subtarget &Subtarget,
                                             unsigned Opcode) {
28142   return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
28143 }
28144 
// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget.
28147 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
28148                                     unsigned Opcode) {
28149 
28150   if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
28151     return false;
28152 
  // vXi16 is supported only on AVX-512 with BWI.
28154   if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
28155     return false;
28156 
28157   if (Subtarget.hasAVX512())
28158     return true;
28159 
28160   bool LShift = VT.is128BitVector() || VT.is256BitVector();
  bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
28162   return (Opcode == ISD::SRA) ? AShift : LShift;
28163 }
28164 
28165 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
28166                                          const X86Subtarget &Subtarget) {
28167   MVT VT = Op.getSimpleValueType();
28168   SDLoc dl(Op);
28169   SDValue R = Op.getOperand(0);
28170   SDValue Amt = Op.getOperand(1);
28171   unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
28172 
28173   auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
28174     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
28175     MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
28176     SDValue Ex = DAG.getBitcast(ExVT, R);
28177 
28178     // ashr(R, 63) === cmp_slt(R, 0)
28179     if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
28180       assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
28181              "Unsupported PCMPGT op");
28182       return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
28183     }
28184 
28185     if (ShiftAmt >= 32) {
28186       // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
28187       SDValue Upper =
28188           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
28189       SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28190                                                  ShiftAmt - 32, DAG);
28191       if (VT == MVT::v2i64)
28192         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
28193       if (VT == MVT::v4i64)
28194         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28195                                   {9, 1, 11, 3, 13, 5, 15, 7});
28196     } else {
28197       // SRA upper i32, SRL whole i64 and select lower i32.
28198       SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28199                                                  ShiftAmt, DAG);
28200       SDValue Lower =
28201           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
28202       Lower = DAG.getBitcast(ExVT, Lower);
28203       if (VT == MVT::v2i64)
28204         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
28205       if (VT == MVT::v4i64)
28206         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28207                                   {8, 1, 10, 3, 12, 5, 14, 7});
28208     }
28209     return DAG.getBitcast(VT, Ex);
28210   };
28211 
28212   // Optimize shl/srl/sra with constant shift amount.
28213   APInt APIntShiftAmt;
28214   if (!X86::isConstantSplat(Amt, APIntShiftAmt))
28215     return SDValue();
28216 
28217   // If the shift amount is out of range, return undef.
28218   if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
28219     return DAG.getUNDEF(VT);
28220 
28221   uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
28222 
28223   if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
28224     return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
28225 
28226   // i64 SRA needs to be performed as partial shifts.
28227   if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
28228        (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
28229       Op.getOpcode() == ISD::SRA)
28230     return ArithmeticShiftRight64(ShiftAmt);
28231 
28232   if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
28233       (Subtarget.hasBWI() && VT == MVT::v64i8)) {
28234     unsigned NumElts = VT.getVectorNumElements();
28235     MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28236 
28237     // Simple i8 add case
28238     if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
28239       return DAG.getNode(ISD::ADD, dl, VT, R, R);
28240 
28241     // ashr(R, 7)  === cmp_slt(R, 0)
28242     if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
28243       SDValue Zeros = DAG.getConstant(0, dl, VT);
28244       if (VT.is512BitVector()) {
28245         assert(VT == MVT::v64i8 && "Unexpected element type!");
28246         SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
28247         return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
28248       }
28249       return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
28250     }
28251 
28252     // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
28253     if (VT == MVT::v16i8 && Subtarget.hasXOP())
28254       return SDValue();
28255 
28256     if (Op.getOpcode() == ISD::SHL) {
28257       // Make a large shift.
28258       SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
28259                                                ShiftAmt, DAG);
28260       SHL = DAG.getBitcast(VT, SHL);
28261       // Zero out the rightmost bits.
28262       APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
28263       return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
28264     }
28265     if (Op.getOpcode() == ISD::SRL) {
28266       // Make a large shift.
28267       SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
28268                                                ShiftAmt, DAG);
28269       SRL = DAG.getBitcast(VT, SRL);
28270       // Zero out the leftmost bits.
28271       APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
28272       return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
28273     }
28274     if (Op.getOpcode() == ISD::SRA) {
28275       // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
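      // e.g. with Amt = 4: R = 0xF0 (-16) gives lshr = 0x0F, Mask = 0x08,
      // xor = 0x07, sub = 0xFF (-1), which matches ashr(-16, 4).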
28276       SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
28277 
28278       SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
28279       Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
28280       Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
28281       return Res;
28282     }
28283     llvm_unreachable("Unknown shift opcode.");
28284   }
28285 
28286   return SDValue();
28287 }
28288 
28289 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
28290                                         const X86Subtarget &Subtarget) {
28291   MVT VT = Op.getSimpleValueType();
28292   SDLoc dl(Op);
28293   SDValue R = Op.getOperand(0);
28294   SDValue Amt = Op.getOperand(1);
28295   unsigned Opcode = Op.getOpcode();
28296   unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
28297   unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
28298 
28299   if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
28300     if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
28301       MVT EltVT = VT.getVectorElementType();
28302       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
28303       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
28304         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
28305       else if (EltVT.bitsLT(MVT::i32))
28306         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
28307 
28308       return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
28309     }
28310 
28311     // vXi8 shifts - shift as v8i16 + mask result.
28312     if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
28313          (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
28314          VT == MVT::v64i8) &&
28315         !Subtarget.hasXOP()) {
28316       unsigned NumElts = VT.getVectorNumElements();
28317       MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28318       if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
28319         unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
28320         unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
28321         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
28322 
28323         // Create the mask using vXi16 shifts. For shift-rights we need to move
28324         // the upper byte down before splatting the vXi8 mask.
28325         SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
28326         BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
28327                                       BaseShAmt, Subtarget, DAG);
28328         if (Opcode != ISD::SHL)
28329           BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
28330                                                8, DAG);
28331         BitMask = DAG.getBitcast(VT, BitMask);
28332         BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
28333                                        SmallVector<int, 64>(NumElts, 0));
28334 
28335         SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
28336                                           DAG.getBitcast(ExtVT, R), BaseShAmt,
28337                                           Subtarget, DAG);
28338         Res = DAG.getBitcast(VT, Res);
28339         Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
28340 
28341         if (Opcode == ISD::SRA) {
28342           // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
28343           // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
28344           SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
28345           SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
28346                                          BaseShAmt, Subtarget, DAG);
28347           SignMask = DAG.getBitcast(VT, SignMask);
28348           Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
28349           Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
28350         }
28351         return Res;
28352       }
28353     }
28354   }
28355 
28356   // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
28357   if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
28358       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
28359     Amt = Amt.getOperand(0);
28360     unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
28361     std::vector<SDValue> Vals(Ratio);
28362     for (unsigned i = 0; i != Ratio; ++i)
28363       Vals[i] = Amt.getOperand(i);
28364     for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
28365       for (unsigned j = 0; j != Ratio; ++j)
28366         if (Vals[j] != Amt.getOperand(i + j))
28367           return SDValue();
28368     }
28369 
28370     if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
28371       return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
28372   }
28373   return SDValue();
28374 }
28375 
28376 // Convert a shift/rotate left amount to a multiplication scale factor.
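// Each amount Amt becomes (1 << Amt), so that shl(X, Amt) == mul(X, Scale) and
// the bits rotated out of a ROTL are available as mulhu(X, Scale).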
28377 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
28378                                        const X86Subtarget &Subtarget,
28379                                        SelectionDAG &DAG) {
28380   MVT VT = Amt.getSimpleValueType();
28381   if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
28382         (Subtarget.hasInt256() && VT == MVT::v16i16) ||
28383         (Subtarget.hasVBMI2() && VT == MVT::v32i16) ||
28384         (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
28385     return SDValue();
28386 
28387   if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
28388     SmallVector<SDValue, 8> Elts;
28389     MVT SVT = VT.getVectorElementType();
28390     unsigned SVTBits = SVT.getSizeInBits();
28391     APInt One(SVTBits, 1);
28392     unsigned NumElems = VT.getVectorNumElements();
28393 
28394     for (unsigned i = 0; i != NumElems; ++i) {
28395       SDValue Op = Amt->getOperand(i);
28396       if (Op->isUndef()) {
28397         Elts.push_back(Op);
28398         continue;
28399       }
28400 
28401       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
28402       APInt C(SVTBits, ND->getZExtValue());
28403       uint64_t ShAmt = C.getZExtValue();
28404       if (ShAmt >= SVTBits) {
28405         Elts.push_back(DAG.getUNDEF(SVT));
28406         continue;
28407       }
28408       Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
28409     }
28410     return DAG.getBuildVector(VT, dl, Elts);
28411   }
28412 
28413   // If the target doesn't support variable shifts, use either FP conversion
28414   // or integer multiplication to avoid shifting each element individually.
28415   if (VT == MVT::v4i32) {
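    // Adding (Amt << 23) to the bit pattern of 1.0f (0x3f800000) builds a
    // float with exponent (127 + Amt), i.e. the value 2^Amt, so FP_TO_SINT
    // recovers the integer scale factor (1 << Amt).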
28416     Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
28417     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
28418                       DAG.getConstant(0x3f800000U, dl, VT));
28419     Amt = DAG.getBitcast(MVT::v4f32, Amt);
28420     return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
28421   }
28422 
28423   // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
28424   if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
28425     SDValue Z = DAG.getConstant(0, dl, VT);
28426     SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
28427     SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
28428     Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
28429     Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
28430     if (Subtarget.hasSSE41())
28431       return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
28432 
28433     return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
28434                                         DAG.getBitcast(VT, Hi),
28435                                         {0, 2, 4, 6, 8, 10, 12, 14});
28436   }
28437 
28438   return SDValue();
28439 }
28440 
28441 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
28442                           SelectionDAG &DAG) {
28443   MVT VT = Op.getSimpleValueType();
28444   SDLoc dl(Op);
28445   SDValue R = Op.getOperand(0);
28446   SDValue Amt = Op.getOperand(1);
28447   unsigned EltSizeInBits = VT.getScalarSizeInBits();
28448   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28449 
28450   unsigned Opc = Op.getOpcode();
28451   unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
28452   unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
28453 
28454   assert(VT.isVector() && "Custom lowering only for vector shifts!");
28455   assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
28456 
28457   if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
28458     return V;
28459 
28460   if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
28461     return V;
28462 
28463   if (SupportedVectorVarShift(VT, Subtarget, Opc))
28464     return Op;
28465 
28466   // XOP has 128-bit variable logical/arithmetic shifts.
28467   // +ve/-ve Amt = shift left/right.
28468   if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
28469                              VT == MVT::v8i16 || VT == MVT::v16i8)) {
28470     if (Opc == ISD::SRL || Opc == ISD::SRA) {
28471       SDValue Zero = DAG.getConstant(0, dl, VT);
28472       Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
28473     }
28474     if (Opc == ISD::SHL || Opc == ISD::SRL)
28475       return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
28476     if (Opc == ISD::SRA)
28477       return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
28478   }
28479 
  // v2i64 vector logical shifts can efficiently avoid scalarization - do the
28481   // shifts per-lane and then shuffle the partial results back together.
28482   if (VT == MVT::v2i64 && Opc != ISD::SRA) {
28483     // Splat the shift amounts so the scalar shifts above will catch it.
28484     SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
28485     SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
28486     SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
28487     SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
28488     return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
28489   }
28490 
28491   // i64 vector arithmetic shift can be emulated with the transform:
28492   // M = lshr(SIGN_MASK, Amt)
28493   // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
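  // M has a single bit set at the position where the original sign bit lands
  // after the logical shift; the xor/sub pair sign-extends from that bit.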
28494   if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
28495       Opc == ISD::SRA) {
28496     SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
28497     SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
28498     R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
28499     R = DAG.getNode(ISD::XOR, dl, VT, R, M);
28500     R = DAG.getNode(ISD::SUB, dl, VT, R, M);
28501     return R;
28502   }
28503 
28504   // If possible, lower this shift as a sequence of two shifts by
28505   // constant plus a BLENDing shuffle instead of scalarizing it.
28506   // Example:
28507   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
28508   //
28509   // Could be rewritten as:
28510   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
28511   //
28512   // The advantage is that the two shifts from the example would be
28513   // lowered as X86ISD::VSRLI nodes in parallel before blending.
28514   if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
28515                       (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
28516     SDValue Amt1, Amt2;
28517     unsigned NumElts = VT.getVectorNumElements();
28518     SmallVector<int, 8> ShuffleMask;
28519     for (unsigned i = 0; i != NumElts; ++i) {
28520       SDValue A = Amt->getOperand(i);
28521       if (A.isUndef()) {
28522         ShuffleMask.push_back(SM_SentinelUndef);
28523         continue;
28524       }
28525       if (!Amt1 || Amt1 == A) {
28526         ShuffleMask.push_back(i);
28527         Amt1 = A;
28528         continue;
28529       }
28530       if (!Amt2 || Amt2 == A) {
28531         ShuffleMask.push_back(i + NumElts);
28532         Amt2 = A;
28533         continue;
28534       }
28535       break;
28536     }
28537 
28538     // Only perform this blend if we can perform it without loading a mask.
28539     if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
28540         (VT != MVT::v16i16 ||
28541          is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
28542         (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
28543          canWidenShuffleElements(ShuffleMask))) {
28544       auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
28545       auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
28546       if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
28547           Cst2->getAPIntValue().ult(EltSizeInBits)) {
28548         SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
28549                                                     Cst1->getZExtValue(), DAG);
28550         SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
28551                                                     Cst2->getZExtValue(), DAG);
28552         return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
28553       }
28554     }
28555   }
28556 
28557   // If possible, lower this packed shift into a vector multiply instead of
28558   // expanding it into a sequence of scalar shifts.
28559   if (Opc == ISD::SHL)
28560     if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
28561       return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
28562 
28563   // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
28564   // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
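  // i.e. srl(X, Amt) == mulhu(X, 1 << (16 - Amt)) for Amt in [1,15]; the
  // Amt == 0 lanes are selected back to X below.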
28565   if (Opc == ISD::SRL && ConstantAmt &&
28566       (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
28567     SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
28568     SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
28569     if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
28570       SDValue Zero = DAG.getConstant(0, dl, VT);
28571       SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
28572       SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
28573       return DAG.getSelect(dl, VT, ZAmt, R, Res);
28574     }
28575   }
28576 
28577   // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
28578   // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
28579   // TODO: Special case handling for shift by 0/1, really we can afford either
28580   // of these cases in pre-SSE41/XOP/AVX512 but not both.
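  // i.e. sra(X, Amt) == mulhs(X, 1 << (16 - Amt)) for Amt in [2,15]; the
  // Amt == 0 and Amt == 1 lanes are fixed up by the selects below.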
28581   if (Opc == ISD::SRA && ConstantAmt &&
28582       (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
28583       ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
28584         !Subtarget.hasAVX512()) ||
28585        DAG.isKnownNeverZero(Amt))) {
28586     SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
28587     SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
28588     if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
28589       SDValue Amt0 =
28590           DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
28591       SDValue Amt1 =
28592           DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
28593       SDValue Sra1 =
28594           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
28595       SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
28596       Res = DAG.getSelect(dl, VT, Amt0, R, Res);
28597       return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
28598     }
28599   }
28600 
  // v4i32 non-uniform shifts.
28602   // If the shift amount is constant we can shift each lane using the SSE2
28603   // immediate shifts, else we need to zero-extend each lane to the lower i64
28604   // and shift using the SSE2 variable shifts.
28605   // The separate results can then be blended together.
28606   if (VT == MVT::v4i32) {
28607     SDValue Amt0, Amt1, Amt2, Amt3;
28608     if (ConstantAmt) {
28609       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
28610       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
28611       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
28612       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
28613     } else {
28614       // The SSE2 shifts use the lower i64 as the same shift amount for
28615       // all lanes and the upper i64 is ignored. On AVX we're better off
28616       // just zero-extending, but for SSE just duplicating the top 16-bits is
28617       // cheaper and has the same effect for out of range values.
28618       if (Subtarget.hasAVX()) {
28619         SDValue Z = DAG.getConstant(0, dl, VT);
28620         Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
28621         Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
28622         Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
28623         Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
28624       } else {
28625         SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
28626         SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28627                                              {4, 5, 6, 7, -1, -1, -1, -1});
28628         Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28629                                     {0, 1, 1, 1, -1, -1, -1, -1});
28630         Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28631                                     {2, 3, 3, 3, -1, -1, -1, -1});
28632         Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
28633                                     {0, 1, 1, 1, -1, -1, -1, -1});
28634         Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
28635                                     {2, 3, 3, 3, -1, -1, -1, -1});
28636       }
28637     }
28638 
28639     unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
28640     SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
28641     SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
28642     SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
28643     SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
28644 
28645     // Merge the shifted lane results optimally with/without PBLENDW.
28646     // TODO - ideally shuffle combining would handle this.
28647     if (Subtarget.hasSSE41()) {
28648       SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
28649       SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
28650       return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
28651     }
28652     SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
28653     SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
28654     return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
28655   }
28656 
28657   // It's worth extending once and using the vXi16/vXi32 shifts for smaller
28658   // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
28659   // make the existing SSE solution better.
  // NOTE: We honor the preferred vector width before promoting to 512-bits.
28661   if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
28662       (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
28663       (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
28664       (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
28665       (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
28666     assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
28667            "Unexpected vector type");
28668     MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
28669     MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
28670     unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28671     R = DAG.getNode(ExtOpc, dl, ExtVT, R);
28672     Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
28673     return DAG.getNode(ISD::TRUNCATE, dl, VT,
28674                        DAG.getNode(Opc, dl, ExtVT, R, Amt));
28675   }
28676 
28677   // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
28678   // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
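  // Per lane, (ext(X) * (1 << (8 - Amt))) >> 8 leaves X >> Amt in the low
  // byte (sign-extend X for SRA, zero-extend for SRL).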
28679   if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
28680       (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28681        (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28682       !Subtarget.hasXOP()) {
28683     int NumElts = VT.getVectorNumElements();
28684     SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
28685 
28686     // Extend constant shift amount to vXi16 (it doesn't matter if the type
28687     // isn't legal).
28688     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28689     Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
28690     Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
28691     Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
28692     assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
28693            "Constant build vector expected");
28694 
28695     if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
28696       R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
28697                           : DAG.getZExtOrTrunc(R, dl, ExVT);
28698       R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
28699       R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
28700       return DAG.getZExtOrTrunc(R, dl, VT);
28701     }
28702 
28703     SmallVector<SDValue, 16> LoAmt, HiAmt;
28704     for (int i = 0; i != NumElts; i += 16) {
28705       for (int j = 0; j != 8; ++j) {
28706         LoAmt.push_back(Amt.getOperand(i + j));
28707         HiAmt.push_back(Amt.getOperand(i + j + 8));
28708       }
28709     }
28710 
28711     MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
28712     SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
28713     SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
28714 
28715     SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
28716     SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
28717     LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
28718     HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
28719     LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
28720     HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
28721     LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
28722     HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
28723     return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
28724   }
28725 
28726   if (VT == MVT::v16i8 ||
28727       (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
28728       (VT == MVT::v64i8 && Subtarget.hasBWI())) {
28729     MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
28730 
28731     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
28732       if (VT.is512BitVector()) {
28733         // On AVX512BW targets we make use of the fact that VSELECT lowers
28734         // to a masked blend which selects bytes based just on the sign bit
28735         // extracted to a mask.
28736         MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
28737         V0 = DAG.getBitcast(VT, V0);
28738         V1 = DAG.getBitcast(VT, V1);
28739         Sel = DAG.getBitcast(VT, Sel);
28740         Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
28741                            ISD::SETGT);
28742         return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
28743       } else if (Subtarget.hasSSE41()) {
28744         // On SSE41 targets we can use PBLENDVB which selects bytes based just
28745         // on the sign bit.
28746         V0 = DAG.getBitcast(VT, V0);
28747         V1 = DAG.getBitcast(VT, V1);
28748         Sel = DAG.getBitcast(VT, Sel);
28749         return DAG.getBitcast(SelVT,
28750                               DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
28751       }
28752       // On pre-SSE41 targets we test for the sign bit by comparing to
28753       // zero - a negative value will set all bits of the lanes to true
28754       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
28755       SDValue Z = DAG.getConstant(0, dl, SelVT);
28756       SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
28757       return DAG.getSelect(dl, SelVT, C, V0, V1);
28758     };
28759 
28760     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
28761     // We can safely do this using i16 shifts as we're only interested in
28762     // the 3 lower bits of each byte.
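    // After the shift, amount bit 2 sits in each byte's sign bit; each select
    // stage below tests it and 'a += a' then exposes the next lower bit.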
28763     Amt = DAG.getBitcast(ExtVT, Amt);
28764     Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
28765     Amt = DAG.getBitcast(VT, Amt);
28766 
28767     if (Opc == ISD::SHL || Opc == ISD::SRL) {
28768       // r = VSELECT(r, shift(r, 4), a);
28769       SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
28770       R = SignBitSelect(VT, Amt, M, R);
28771 
28772       // a += a
28773       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28774 
28775       // r = VSELECT(r, shift(r, 2), a);
28776       M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
28777       R = SignBitSelect(VT, Amt, M, R);
28778 
28779       // a += a
28780       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28781 
28782       // return VSELECT(r, shift(r, 1), a);
28783       M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
28784       R = SignBitSelect(VT, Amt, M, R);
28785       return R;
28786     }
28787 
28788     if (Opc == ISD::SRA) {
      // For SRA we need to unpack each byte to the higher byte of an i16
      // vector so we can correctly sign extend. We don't care what happens to
      // the lower byte.
28792       SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
28793       SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
28794       SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
28795       SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
28796       ALo = DAG.getBitcast(ExtVT, ALo);
28797       AHi = DAG.getBitcast(ExtVT, AHi);
28798       RLo = DAG.getBitcast(ExtVT, RLo);
28799       RHi = DAG.getBitcast(ExtVT, RHi);
28800 
28801       // r = VSELECT(r, shift(r, 4), a);
28802       SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
28803       SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
28804       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28805       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28806 
28807       // a += a
28808       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
28809       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
28810 
28811       // r = VSELECT(r, shift(r, 2), a);
28812       MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
28813       MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
28814       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28815       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28816 
28817       // a += a
28818       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
28819       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
28820 
28821       // r = VSELECT(r, shift(r, 1), a);
28822       MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
28823       MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
28824       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28825       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28826 
28827       // Logical shift the result back to the lower byte, leaving a zero upper
28828       // byte meaning that we can safely pack with PACKUSWB.
28829       RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
28830       RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
28831       return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
28832     }
28833   }
28834 
28835   if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
28836     MVT ExtVT = MVT::v8i32;
28837     SDValue Z = DAG.getConstant(0, dl, VT);
28838     SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
28839     SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
28840     SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
28841     SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
28842     ALo = DAG.getBitcast(ExtVT, ALo);
28843     AHi = DAG.getBitcast(ExtVT, AHi);
28844     RLo = DAG.getBitcast(ExtVT, RLo);
28845     RHi = DAG.getBitcast(ExtVT, RHi);
28846     SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
28847     SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
28848     Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
28849     Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
28850     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
28851   }
28852 
28853   if (VT == MVT::v8i16) {
    // If we have a constant shift amount, the non-SSE41 path is best as
    // avoiding bitcasts makes it easier to constant fold and reduce to
    // PBLENDW.
28856     bool UseSSE41 = Subtarget.hasSSE41() &&
28857                     !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28858 
28859     auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
28860       // On SSE41 targets we can use PBLENDVB which selects bytes based just on
28861       // the sign bit.
28862       if (UseSSE41) {
28863         MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
28864         V0 = DAG.getBitcast(ExtVT, V0);
28865         V1 = DAG.getBitcast(ExtVT, V1);
28866         Sel = DAG.getBitcast(ExtVT, Sel);
28867         return DAG.getBitcast(
28868             VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
28869       }
28870       // On pre-SSE41 targets we splat the sign bit - a negative value will
28871       // set all bits of the lanes to true and VSELECT uses that in
28872       // its OR(AND(V0,C),AND(V1,~C)) lowering.
28873       SDValue C =
28874           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
28875       return DAG.getSelect(dl, VT, C, V0, V1);
28876     };
28877 
28878     // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
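    // i16 shift amounts only use the low 4 bits, so << 12 puts amount bit 3
    // in the sign position; each 'a += a' below exposes the next lower bit.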
28879     if (UseSSE41) {
28880       // On SSE41 targets we need to replicate the shift mask in both
28881       // bytes for PBLENDVB.
28882       Amt = DAG.getNode(
28883           ISD::OR, dl, VT,
28884           getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
28885           getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
28886     } else {
28887       Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
28888     }
28889 
28890     // r = VSELECT(r, shift(r, 8), a);
28891     SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
28892     R = SignBitSelect(Amt, M, R);
28893 
28894     // a += a
28895     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28896 
28897     // r = VSELECT(r, shift(r, 4), a);
28898     M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
28899     R = SignBitSelect(Amt, M, R);
28900 
28901     // a += a
28902     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28903 
28904     // r = VSELECT(r, shift(r, 2), a);
28905     M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
28906     R = SignBitSelect(Amt, M, R);
28907 
28908     // a += a
28909     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28910 
28911     // return VSELECT(r, shift(r, 1), a);
28912     M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
28913     R = SignBitSelect(Amt, M, R);
28914     return R;
28915   }
28916 
28917   // Decompose 256-bit shifts into 128-bit shifts.
28918   if (VT.is256BitVector())
28919     return splitVectorIntBinary(Op, DAG);
28920 
28921   if (VT == MVT::v32i16 || VT == MVT::v64i8)
28922     return splitVectorIntBinary(Op, DAG);
28923 
28924   return SDValue();
28925 }
28926 
28927 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
28928                            SelectionDAG &DAG) {
28929   MVT VT = Op.getSimpleValueType();
28930   assert(VT.isVector() && "Custom lowering only for vector rotates!");
28931 
28932   SDLoc DL(Op);
28933   SDValue R = Op.getOperand(0);
28934   SDValue Amt = Op.getOperand(1);
28935   unsigned Opcode = Op.getOpcode();
28936   unsigned EltSizeInBits = VT.getScalarSizeInBits();
28937   int NumElts = VT.getVectorNumElements();
28938 
28939   // Check for constant splat rotation amount.
28940   APInt CstSplatValue;
28941   bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
28942 
28943   // Check for splat rotate by zero.
28944   if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
28945     return R;
28946 
28947   // AVX512 implicitly uses modulo rotation amounts.
28948   if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
28949     // Attempt to rotate by immediate.
28950     if (IsCstSplat) {
28951       unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
28952       uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
28953       return DAG.getNode(RotOpc, DL, VT, R,
28954                          DAG.getTargetConstant(RotAmt, DL, MVT::i8));
28955     }
28956 
28957     // Else, fall-back on VPROLV/VPRORV.
28958     return Op;
28959   }
28960 
28961   // AVX512 VBMI2 vXi16 - lower to funnel shifts.
28962   if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
28963     unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR);
28964     return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
28965   }
28966 
28967   assert((Opcode == ISD::ROTL) && "Only ROTL supported");
28968 
28969   // XOP has 128-bit vector variable + immediate rotates.
28970   // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
28971   // XOP implicitly uses modulo rotation amounts.
28972   if (Subtarget.hasXOP()) {
28973     if (VT.is256BitVector())
28974       return splitVectorIntBinary(Op, DAG);
28975     assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
28976 
28977     // Attempt to rotate by immediate.
28978     if (IsCstSplat) {
28979       uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
28980       return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
28981                          DAG.getTargetConstant(RotAmt, DL, MVT::i8));
28982     }
28983 
28984     // Use general rotate by variable (per-element).
28985     return Op;
28986   }
28987 
28988   // Split 256-bit integers on pre-AVX2 targets.
28989   if (VT.is256BitVector() && !Subtarget.hasAVX2())
28990     return splitVectorIntBinary(Op, DAG);
28991 
28992   assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
28993           ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 ||
28994             VT == MVT::v32i16) &&
28995            Subtarget.hasAVX2())) &&
28996          "Only vXi32/vXi16/vXi8 vector rotates supported");
28997 
  // Rotate by a uniform constant - expand back to shifts.
28999   if (IsCstSplat)
29000     return SDValue();
29001 
29002   bool IsSplatAmt = DAG.isSplatValue(Amt);
29003 
29004   // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
29005   // the amount bit.
29006   if (EltSizeInBits == 8 && !IsSplatAmt) {
29007     if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
29008       return SDValue();
29009 
29010     // We don't need ModuloAmt here as we just peek at individual bits.
29011     MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29012 
29013     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29014       if (Subtarget.hasSSE41()) {
29015         // On SSE41 targets we can use PBLENDVB which selects bytes based just
29016         // on the sign bit.
29017         V0 = DAG.getBitcast(VT, V0);
29018         V1 = DAG.getBitcast(VT, V1);
29019         Sel = DAG.getBitcast(VT, Sel);
29020         return DAG.getBitcast(SelVT,
29021                               DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
29022       }
29023       // On pre-SSE41 targets we test for the sign bit by comparing to
29024       // zero - a negative value will set all bits of the lanes to true
29025       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29026       SDValue Z = DAG.getConstant(0, DL, SelVT);
29027       SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
29028       return DAG.getSelect(DL, SelVT, C, V0, V1);
29029     };
29030 
29031     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29032     // We can safely do this using i16 shifts as we're only interested in
29033     // the 3 lower bits of each byte.
29034     Amt = DAG.getBitcast(ExtVT, Amt);
29035     Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
29036     Amt = DAG.getBitcast(VT, Amt);
29037 
29038     // r = VSELECT(r, rot(r, 4), a);
29039     SDValue M;
29040     M = DAG.getNode(
29041         ISD::OR, DL, VT,
29042         DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
29043         DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
29044     R = SignBitSelect(VT, Amt, M, R);
29045 
29046     // a += a
29047     Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
29048 
29049     // r = VSELECT(r, rot(r, 2), a);
29050     M = DAG.getNode(
29051         ISD::OR, DL, VT,
29052         DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
29053         DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
29054     R = SignBitSelect(VT, Amt, M, R);
29055 
29056     // a += a
29057     Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
29058 
29059     // return VSELECT(r, rot(r, 1), a);
29060     M = DAG.getNode(
29061         ISD::OR, DL, VT,
29062         DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
29063         DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
29064     return SignBitSelect(VT, Amt, M, R);
29065   }
29066 
29067   // ISD::ROT* uses modulo rotate amounts.
29068   if (SDValue BaseRotAmt = DAG.getSplatValue(Amt)) {
    // If the amount is a splat, perform the modulo BEFORE the splat;
    // this helps LowerScalarVariableShift to remove the splat later.
29071     Amt = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, BaseRotAmt);
29072     Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
29073                       DAG.getConstant(EltSizeInBits - 1, DL, VT));
29074     Amt = DAG.getVectorShuffle(VT, DL, Amt, DAG.getUNDEF(VT),
29075                                SmallVector<int>(NumElts, 0));
29076   } else {
29077     Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
29078                       DAG.getConstant(EltSizeInBits - 1, DL, VT));
29079   }
29080 
29081   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29082   bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
29083                         SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
29084 
29085   // Fallback for splats + all supported variable shifts.
  // Fallback for non-constant AVX2 vXi16 as well.
29087   if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
29088     SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
29089     AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
29090     SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
29091     SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
29092     return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
29093   }
29094 
29095   // As with shifts, convert the rotation amount to a multiplication factor.
29096   SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
29097   assert(Scale && "Failed to convert ROTL amount to scale");
29098 
29099   // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
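  // MUL gives X << Amt (mod 2^16) and MULHU gives X >> (16 - Amt), so the OR
  // is the rotate result (the Amt == 0 lanes already give X | 0 == X).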
29100   if (EltSizeInBits == 16) {
29101     SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
29102     SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
29103     return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
29104   }
29105 
29106   // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
29107   // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
29108   // that can then be OR'd with the lower 32-bits.
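  // i.e. for each lane: rotl(X, Amt) == lo32(X * 2^Amt) | hi32(X * 2^Amt).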
29109   assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
29110   static const int OddMask[] = {1, -1, 3, -1};
29111   SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
29112   SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
29113 
29114   SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
29115                               DAG.getBitcast(MVT::v2i64, R),
29116                               DAG.getBitcast(MVT::v2i64, Scale));
29117   SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
29118                               DAG.getBitcast(MVT::v2i64, R13),
29119                               DAG.getBitcast(MVT::v2i64, Scale13));
29120   Res02 = DAG.getBitcast(VT, Res02);
29121   Res13 = DAG.getBitcast(VT, Res13);
29122 
29123   return DAG.getNode(ISD::OR, DL, VT,
29124                      DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
29125                      DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
29126 }
29127 
29128 /// Returns true if the operand type is exactly twice the native width, and
29129 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
29130 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
29131 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
29132 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
29133   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
29134 
29135   if (OpWidth == 64)
29136     return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
29137   if (OpWidth == 128)
29138     return Subtarget.hasCmpxchg16b();
29139 
29140   return false;
29141 }
29142 
29143 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
29144   Type *MemType = SI->getValueOperand()->getType();
29145 
29146   bool NoImplicitFloatOps =
29147       SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
29148   if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
29149       !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
29150       (Subtarget.hasSSE1() || Subtarget.hasX87()))
29151     return false;
29152 
29153   return needsCmpXchgNb(MemType);
29154 }
29155 
29156 // Note: this turns large loads into lock cmpxchg8b/16b.
29157 // TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
29158 TargetLowering::AtomicExpansionKind
29159 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
29160   Type *MemType = LI->getType();
29161 
  // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
29163   // can use movq to do the load. If we have X87 we can load into an 80-bit
29164   // X87 register and store it to a stack temporary.
29165   bool NoImplicitFloatOps =
29166       LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
29167   if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
29168       !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
29169       (Subtarget.hasSSE1() || Subtarget.hasX87()))
29170     return AtomicExpansionKind::None;
29171 
29172   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
29173                                  : AtomicExpansionKind::None;
29174 }
29175 
29176 TargetLowering::AtomicExpansionKind
29177 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
29178   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
29179   Type *MemType = AI->getType();
29180 
29181   // If the operand is too big, we must see if cmpxchg8/16b is available
29182   // and default to library calls otherwise.
29183   if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
29184     return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
29185                                    : AtomicExpansionKind::None;
29186   }
29187 
29188   AtomicRMWInst::BinOp Op = AI->getOperation();
29189   switch (Op) {
29190   default:
29191     llvm_unreachable("Unknown atomic operation");
29192   case AtomicRMWInst::Xchg:
29193   case AtomicRMWInst::Add:
29194   case AtomicRMWInst::Sub:
    // It's better to use xadd or xchg for these in all cases (sub is just an
    // xadd of the negated value).
29196     return AtomicExpansionKind::None;
29197   case AtomicRMWInst::Or:
29198   case AtomicRMWInst::And:
29199   case AtomicRMWInst::Xor:
29200     // If the atomicrmw's result isn't actually used, we can just add a "lock"
29201     // prefix to a normal instruction for these operations.
29202     return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
29203                             : AtomicExpansionKind::None;
29204   case AtomicRMWInst::Nand:
29205   case AtomicRMWInst::Max:
29206   case AtomicRMWInst::Min:
29207   case AtomicRMWInst::UMax:
29208   case AtomicRMWInst::UMin:
29209   case AtomicRMWInst::FAdd:
29210   case AtomicRMWInst::FSub:
29211     // These always require a non-trivial set of data operations on x86. We must
29212     // use a cmpxchg loop.
29213     return AtomicExpansionKind::CmpXChg;
29214   }
29215 }
29216 
29217 LoadInst *
29218 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
29219   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
29220   Type *MemType = AI->getType();
29221   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
29222   // there is no benefit in turning such RMWs into loads, and it is actually
29223   // harmful as it introduces a mfence.
29224   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
29225     return nullptr;
29226 
29227   // If this is a canonical idempotent atomicrmw w/no uses, we have a better
29228   // lowering available in lowerAtomicArith.
29229   // TODO: push more cases through this path.
29230   if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
29231     if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
29232         AI->use_empty())
29233       return nullptr;
29234 
29235   IRBuilder<> Builder(AI);
29236   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29237   auto SSID = AI->getSyncScopeID();
29238   // We must restrict the ordering to avoid generating loads with Release or
29239   // ReleaseAcquire orderings.
  auto Order =
      AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
29241 
29242   // Before the load we need a fence. Here is an example lifted from
29243   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
29244   // is required:
29245   // Thread 0:
29246   //   x.store(1, relaxed);
29247   //   r1 = y.fetch_add(0, release);
29248   // Thread 1:
29249   //   y.fetch_add(42, acquire);
29250   //   r2 = x.load(relaxed);
29251   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
29252   // lowered to just a load without a fence. A mfence flushes the store buffer,
29253   // making the optimization clearly correct.
  // FIXME: The fence is required if isReleaseOrStronger(Order), but it is not
  // clear that it is needed otherwise; we might be able to be more aggressive
  // about relaxed idempotent rmw. In practice, such cases do not look useful,
  // so we don't try to be especially clever.
29258   if (SSID == SyncScope::SingleThread)
29259     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
29260     // the IR level, so we must wrap it in an intrinsic.
29261     return nullptr;
29262 
29263   if (!Subtarget.hasMFence())
29264     // FIXME: it might make sense to use a locked operation here but on a
29265     // different cache-line to prevent cache-line bouncing. In practice it
29266     // is probably a small win, and x86 processors without mfence are rare
29267     // enough that we do not bother.
29268     return nullptr;
29269 
29270   Function *MFence =
29271       llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
29272   Builder.CreateCall(MFence, {});
29273 
29274   // Finally we can emit the atomic load.
29275   LoadInst *Loaded = Builder.CreateAlignedLoad(
29276       AI->getType(), AI->getPointerOperand(), AI->getAlign());
29277   Loaded->setAtomic(Order, SSID);
29278   AI->replaceAllUsesWith(Loaded);
29279   AI->eraseFromParent();
29280   return Loaded;
29281 }
29282 
bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(
    const StoreInst &SI) const {
29284   if (!SI.isUnordered())
29285     return false;
29286   return ExperimentalUnorderedISEL;
29287 }
29288 bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
29289   if (!LI.isUnordered())
29290     return false;
29291   return ExperimentalUnorderedISEL;
29292 }
29293 
29294 
29295 /// Emit a locked operation on a stack location which does not change any
29296 /// memory location, but does involve a lock prefix.  Location is chosen to be
29297 /// a) very likely accessed only by a single thread to minimize cache traffic,
29298 /// and b) definitely dereferenceable.  Returns the new Chain result.
29299 static SDValue emitLockedStackOp(SelectionDAG &DAG,
29300                                  const X86Subtarget &Subtarget, SDValue Chain,
29301                                  const SDLoc &DL) {
29302   // Implementation notes:
29303   // 1) LOCK prefix creates a full read/write reordering barrier for memory
29304   // operations issued by the current processor.  As such, the location
29305   // referenced is not relevant for the ordering properties of the instruction.
  // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
29307   // 8.2.3.9  Loads and Stores Are Not Reordered with Locked Instructions
29308   // 2) Using an immediate operand appears to be the best encoding choice
29309   // here since it doesn't require an extra register.
29310   // 3) OR appears to be very slightly faster than ADD. (Though, the difference
29311   // is small enough it might just be measurement noise.)
29312   // 4) When choosing offsets, there are several contributing factors:
29313   //   a) If there's no redzone, we default to TOS.  (We could allocate a cache
29314   //      line aligned stack object to improve this case.)
29315   //   b) To minimize our chances of introducing a false dependence, we prefer
29316   //      to offset the stack usage from TOS slightly.
29317   //   c) To minimize concerns about cross thread stack usage - in particular,
29318   //      the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
29319   //      captures state in the TOS frame and accesses it from many threads -
29320   //      we want to use an offset such that the offset is in a distinct cache
29321   //      line from the TOS frame.
29322   //
29323   // For a general discussion of the tradeoffs and benchmark results, see:
29324   // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
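  //
  // Concretely, with a red zone this emits "lock orl $0, -64(%rsp)"; without
  // one the displacement is 0 (and %esp is the base in 32-bit mode).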
29325 
29326   auto &MF = DAG.getMachineFunction();
29327   auto &TFL = *Subtarget.getFrameLowering();
29328   const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
29329 
29330   if (Subtarget.is64Bit()) {
29331     SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29332     SDValue Ops[] = {
29333       DAG.getRegister(X86::RSP, MVT::i64),                  // Base
29334       DAG.getTargetConstant(1, DL, MVT::i8),                // Scale
29335       DAG.getRegister(0, MVT::i64),                         // Index
29336       DAG.getTargetConstant(SPOffset, DL, MVT::i32),        // Disp
29337       DAG.getRegister(0, MVT::i16),                         // Segment.
29338       Zero,
29339       Chain};
29340     SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29341                                      MVT::Other, Ops);
29342     return SDValue(Res, 1);
29343   }
29344 
29345   SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29346   SDValue Ops[] = {
29347     DAG.getRegister(X86::ESP, MVT::i32),            // Base
29348     DAG.getTargetConstant(1, DL, MVT::i8),          // Scale
29349     DAG.getRegister(0, MVT::i32),                   // Index
29350     DAG.getTargetConstant(SPOffset, DL, MVT::i32),  // Disp
29351     DAG.getRegister(0, MVT::i16),                   // Segment.
29352     Zero,
29353     Chain
29354   };
29355   SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29356                                    MVT::Other, Ops);
29357   return SDValue(Res, 1);
29358 }
29359 
29360 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
29361                                  SelectionDAG &DAG) {
29362   SDLoc dl(Op);
29363   AtomicOrdering FenceOrdering =
29364       static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
29365   SyncScope::ID FenceSSID =
29366       static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
29367 
29368   // The only fence that needs an instruction is a sequentially-consistent
29369   // cross-thread fence.
29370   if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
29371       FenceSSID == SyncScope::System) {
29372     if (Subtarget.hasMFence())
29373       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
29374 
29375     SDValue Chain = Op.getOperand(0);
29376     return emitLockedStackOp(DAG, Subtarget, Chain, dl);
29377   }
29378 
29379   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29380   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
29381 }
29382 
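// Lower ATOMIC_CMP_SWAP_WITH_SUCCESS to X86ISD::LCMPXCHG_DAG: the expected
// value is copied into the width-appropriate accumulator (AL/AX/EAX/RAX),
// the locked CMPXCHG leaves the original memory value there, and the success
// bit is recovered from ZF via SETE.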
29383 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
29384                              SelectionDAG &DAG) {
29385   MVT T = Op.getSimpleValueType();
29386   SDLoc DL(Op);
29387   unsigned Reg = 0;
29388   unsigned size = 0;
29389   switch(T.SimpleTy) {
29390   default: llvm_unreachable("Invalid value type!");
29391   case MVT::i8:  Reg = X86::AL;  size = 1; break;
29392   case MVT::i16: Reg = X86::AX;  size = 2; break;
29393   case MVT::i32: Reg = X86::EAX; size = 4; break;
29394   case MVT::i64:
29395     assert(Subtarget.is64Bit() && "Node not type legal!");
29396     Reg = X86::RAX; size = 8;
29397     break;
29398   }
29399   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
29400                                   Op.getOperand(2), SDValue());
29401   SDValue Ops[] = { cpIn.getValue(0),
29402                     Op.getOperand(1),
29403                     Op.getOperand(3),
29404                     DAG.getTargetConstant(size, DL, MVT::i8),
29405                     cpIn.getValue(1) };
29406   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
29407   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
29408   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
29409                                            Ops, T, MMO);
29410 
29411   SDValue cpOut =
29412     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
29413   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
29414                                       MVT::i32, cpOut.getValue(2));
29415   SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
29416 
29417   return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29418                      cpOut, Success, EFLAGS.getValue(1));
29419 }
29420 
29421 // Create MOVMSKB, taking into account whether we need to split for AVX1.
29422 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
29423                            const X86Subtarget &Subtarget) {
29424   MVT InVT = V.getSimpleValueType();
29425 
29426   if (InVT == MVT::v64i8) {
29427     SDValue Lo, Hi;
29428     std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29429     Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
29430     Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
29431     Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
29432     Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
29433     Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
29434                      DAG.getConstant(32, DL, MVT::i8));
29435     return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
29436   }
29437   if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
29438     SDValue Lo, Hi;
29439     std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29440     Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
29441     Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
29442     Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
29443                      DAG.getConstant(16, DL, MVT::i8));
29444     return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
29445   }
29446 
29447   return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
29448 }
29449 
29450 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
29451                             SelectionDAG &DAG) {
29452   SDValue Src = Op.getOperand(0);
29453   MVT SrcVT = Src.getSimpleValueType();
29454   MVT DstVT = Op.getSimpleValueType();
29455 
29456   // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
29457   // half to v32i1 and concatenating the result.
29458   if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
29459     assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
29460     assert(Subtarget.hasBWI() && "Expected BWI target");
29461     SDLoc dl(Op);
29462     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
29463                              DAG.getIntPtrConstant(0, dl));
29464     Lo = DAG.getBitcast(MVT::v32i1, Lo);
29465     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
29466                              DAG.getIntPtrConstant(1, dl));
29467     Hi = DAG.getBitcast(MVT::v32i1, Hi);
29468     return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
29469   }
29470 
29471   // Use MOVMSK for vector to scalar conversion to prevent scalarization.
29472   if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
29473     assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
29474     MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
29475     SDLoc DL(Op);
29476     SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
29477     V = getPMOVMSKB(DL, V, DAG, Subtarget);
29478     return DAG.getZExtOrTrunc(V, DL, DstVT);
29479   }
29480 
29481   assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
29482           SrcVT == MVT::i64) && "Unexpected VT!");
29483 
29484   assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
29485   if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
29486       !(DstVT == MVT::x86mmx && SrcVT.isVector()))
29487     // This conversion needs to be expanded.
29488     return SDValue();
29489 
29490   SDLoc dl(Op);
29491   if (SrcVT.isVector()) {
    // Widen the input vector in the case of MVT::v2i32.
29493     // Example: from MVT::v2i32 to MVT::v4i32.
29494     MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
29495                                  SrcVT.getVectorNumElements() * 2);
29496     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
29497                       DAG.getUNDEF(SrcVT));
29498   } else {
29499     assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
29500            "Unexpected source type in LowerBITCAST");
29501     Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
29502   }
29503 
29504   MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
29505   Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
29506 
29507   if (DstVT == MVT::x86mmx)
29508     return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
29509 
29510   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
29511                      DAG.getIntPtrConstant(0, dl));
29512 }
29513 
29514 /// Compute the horizontal sum of bytes in V for the elements of VT.
29515 ///
29516 /// Requires V to be a byte vector and VT to be an integer vector type with
29517 /// wider elements than V's type. The width of the elements of VT determines
29518 /// how many bytes of V are summed horizontally to produce each element of the
29519 /// result.
29520 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
29521                                       const X86Subtarget &Subtarget,
29522                                       SelectionDAG &DAG) {
29523   SDLoc DL(V);
29524   MVT ByteVecVT = V.getSimpleValueType();
29525   MVT EltVT = VT.getVectorElementType();
29526   assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
29527          "Expected value to have byte element type.");
29528   assert(EltVT != MVT::i8 &&
29529          "Horizontal byte sum only makes sense for wider elements!");
29530   unsigned VecSize = VT.getSizeInBits();
29531   assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
29532 
  // The PSADBW instruction horizontally adds all bytes and leaves the result
  // in i64 chunks, so it directly computes the pop count for v2i64 and v4i64.
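  // (For example, PSADBW of a v16i8 of all-ones against a zero vector yields
  // <8, 8>: each 64-bit lane holds the sum of its eight source bytes.)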
29535   if (EltVT == MVT::i64) {
29536     SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
29537     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
29538     V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
29539     return DAG.getBitcast(VT, V);
29540   }
29541 
29542   if (EltVT == MVT::i32) {
29543     // We unpack the low half and high half into i32s interleaved with zeros so
29544     // that we can use PSADBW to horizontally sum them. The most useful part of
29545     // this is that it lines up the results of two PSADBW instructions to be
29546     // two v2i64 vectors which concatenated are the 4 population counts. We can
29547     // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
29548     SDValue Zeros = DAG.getConstant(0, DL, VT);
29549     SDValue V32 = DAG.getBitcast(VT, V);
29550     SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
29551     SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
29552 
29553     // Do the horizontal sums into two v2i64s.
29554     Zeros = DAG.getConstant(0, DL, ByteVecVT);
29555     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
29556     Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
29557                       DAG.getBitcast(ByteVecVT, Low), Zeros);
29558     High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
29559                        DAG.getBitcast(ByteVecVT, High), Zeros);
29560 
29561     // Merge them together.
29562     MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
29563     V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
29564                     DAG.getBitcast(ShortVecVT, Low),
29565                     DAG.getBitcast(ShortVecVT, High));
29566 
29567     return DAG.getBitcast(VT, V);
29568   }
29569 
29570   // The only element type left is i16.
29571   assert(EltVT == MVT::i16 && "Unknown how to handle type");
29572 
29573   // To obtain pop count for each i16 element starting from the pop count for
29574   // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
29575   // right by 8. It is important to shift as i16s as i8 vector shift isn't
29576   // directly supported.
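  // For example, if an i16 element holds the per-byte counts 0x0203 (hi byte
  // 2, lo byte 3), then (0x0203 << 8) = 0x0300, the byte-wise add gives
  // 0x0503, and the i16 shift right by 8 leaves 0x0005 = 2 + 3.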
29577   SDValue ShifterV = DAG.getConstant(8, DL, VT);
29578   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
29579   V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
29580                   DAG.getBitcast(ByteVecVT, V));
29581   return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
29582 }
29583 
29584 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
29585                                         const X86Subtarget &Subtarget,
29586                                         SelectionDAG &DAG) {
29587   MVT VT = Op.getSimpleValueType();
29588   MVT EltVT = VT.getVectorElementType();
29589   int NumElts = VT.getVectorNumElements();
29590   (void)EltVT;
29591   assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
29592 
29593   // Implement a lookup table in register by using an algorithm based on:
29594   // http://wm.ite.pl/articles/sse-popcount.html
29595   //
  // The general idea is that every lower byte nibble in the input vector is an
  // index into an in-register pre-computed pop count table. We then split up
  // the input vector into two new ones: (1) a vector with only the
  // shifted-right higher nibbles for each byte and (2) a vector with the lower
  // nibbles (and masked out higher ones) for each byte. PSHUFB is used
  // separately with both to index the in-register table. Next, both are added
  // and the result is an i8 vector where each element contains the pop count
  // for the input byte.
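  //
  // For example, for the input byte 0xD3 (0b11010011): the high nibble 0xD
  // maps to LUT[0xD] = 3 and the low nibble 0x3 maps to LUT[0x3] = 2, so the
  // reported pop count is 3 + 2 = 5.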
29603   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
29604                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
29605                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
29606                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
29607 
29608   SmallVector<SDValue, 64> LUTVec;
29609   for (int i = 0; i < NumElts; ++i)
29610     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29611   SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
29612   SDValue M0F = DAG.getConstant(0x0F, DL, VT);
29613 
29614   // High nibbles
29615   SDValue FourV = DAG.getConstant(4, DL, VT);
29616   SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
29617 
29618   // Low nibbles
29619   SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
29620 
  // The input vector is used as the shuffle mask that indexes elements into
  // the LUT. After counting low and high nibbles, add the two results to
  // obtain the final pop count per i8 element.
29624   SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
29625   SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
29626   return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
29627 }
29628 
29629 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
29630 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
29631 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
29632                                 SelectionDAG &DAG) {
29633   MVT VT = Op.getSimpleValueType();
29634   assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
29635          "Unknown CTPOP type to handle");
29636   SDLoc DL(Op.getNode());
29637   SDValue Op0 = Op.getOperand(0);
29638 
29639   // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
29640   if (Subtarget.hasVPOPCNTDQ()) {
29641     unsigned NumElems = VT.getVectorNumElements();
29642     assert((VT.getVectorElementType() == MVT::i8 ||
29643             VT.getVectorElementType() == MVT::i16) && "Unexpected type");
29644     if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
29645       MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
29646       Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
29647       Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
29648       return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
29649     }
29650   }
29651 
29652   // Decompose 256-bit ops into smaller 128-bit ops.
29653   if (VT.is256BitVector() && !Subtarget.hasInt256())
29654     return splitVectorIntUnary(Op, DAG);
29655 
29656   // Decompose 512-bit ops into smaller 256-bit ops.
29657   if (VT.is512BitVector() && !Subtarget.hasBWI())
29658     return splitVectorIntUnary(Op, DAG);
29659 
29660   // For element types greater than i8, do vXi8 pop counts and a bytesum.
29661   if (VT.getScalarType() != MVT::i8) {
29662     MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
29663     SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
29664     SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
29665     return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
29666   }
29667 
29668   // We can't use the fast LUT approach, so fall back on LegalizeDAG.
29669   if (!Subtarget.hasSSSE3())
29670     return SDValue();
29671 
29672   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
29673 }
29674 
29675 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
29676                           SelectionDAG &DAG) {
29677   assert(Op.getSimpleValueType().isVector() &&
29678          "We only do custom lowering for vector population count.");
29679   return LowerVectorCTPOP(Op, Subtarget, DAG);
29680 }
29681 
29682 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
29683   MVT VT = Op.getSimpleValueType();
29684   SDValue In = Op.getOperand(0);
29685   SDLoc DL(Op);
29686 
  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
  // perform the BITREVERSE.
29689   if (!VT.isVector()) {
29690     MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
29691     SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
29692     Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
29693     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
29694                        DAG.getIntPtrConstant(0, DL));
29695   }
29696 
29697   int NumElts = VT.getVectorNumElements();
29698   int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
29699 
29700   // Decompose 256-bit ops into smaller 128-bit ops.
29701   if (VT.is256BitVector())
29702     return splitVectorIntUnary(Op, DAG);
29703 
29704   assert(VT.is128BitVector() &&
29705          "Only 128-bit vector bitreverse lowering supported.");
29706 
29707   // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
29708   // perform the BSWAP in the shuffle.
  // It's best to shuffle using the second operand as this will implicitly
  // allow memory folding for multiple vectors.
29711   SmallVector<SDValue, 16> MaskElts;
29712   for (int i = 0; i != NumElts; ++i) {
29713     for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
29714       int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
29715       int PermuteByte = SourceByte | (2 << 5);
29716       MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
29717     }
29718   }
29719 
29720   SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
29721   SDValue Res = DAG.getBitcast(MVT::v16i8, In);
29722   Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
29723                     Res, Mask);
29724   return DAG.getBitcast(VT, Res);
29725 }
29726 
29727 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
29728                                SelectionDAG &DAG) {
29729   MVT VT = Op.getSimpleValueType();
29730 
29731   if (Subtarget.hasXOP() && !VT.is512BitVector())
29732     return LowerBITREVERSE_XOP(Op, DAG);
29733 
29734   assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
29735 
29736   SDValue In = Op.getOperand(0);
29737   SDLoc DL(Op);
29738 
29739   assert(VT.getScalarType() == MVT::i8 &&
29740          "Only byte vector BITREVERSE supported");
29741 
29742   // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
29743   if (VT == MVT::v64i8 && !Subtarget.hasBWI())
29744     return splitVectorIntUnary(Op, DAG);
29745 
29746   // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
29747   if (VT == MVT::v32i8 && !Subtarget.hasInt256())
29748     return splitVectorIntUnary(Op, DAG);
29749 
29750   unsigned NumElts = VT.getVectorNumElements();
29751 
29752   // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
29753   if (Subtarget.hasGFNI()) {
29754     MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
29755     SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
29756     Matrix = DAG.getBitcast(VT, Matrix);
29757     return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
29758                        DAG.getTargetConstant(0, DL, MVT::i8));
29759   }
29760 
  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
  // two nibbles, and a PSHUFB lookup finds the bit-reverse of each
  // 0-15 value (moved to the other nibble).
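  // For example, using the LoLUT/HiLUT tables below, the byte 0xD3
  // (0b11010011) gives LoLUT[0x3] = 0xC0 and HiLUT[0xD] = 0x0B, and
  // 0xC0 | 0x0B = 0xCB (0b11001011), which is 0xD3 with its bits reversed.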
29764   SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
29765   SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
29766   SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
29767 
29768   const int LoLUT[16] = {
29769       /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
29770       /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
29771       /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
29772       /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
29773   const int HiLUT[16] = {
29774       /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
29775       /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
29776       /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
29777       /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
29778 
29779   SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
29780   for (unsigned i = 0; i < NumElts; ++i) {
29781     LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
29782     HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
29783   }
29784 
29785   SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
29786   SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
29787   Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
29788   Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
29789   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
29790 }
29791 
29792 static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
29793                            SelectionDAG &DAG) {
29794   SDLoc DL(Op);
29795   SDValue X = Op.getOperand(0);
29796   MVT VT = Op.getSimpleValueType();
29797 
29798   // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
29799   if (VT == MVT::i8 ||
29800       DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
29801     X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
29802     SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
29803                                 DAG.getConstant(0, DL, MVT::i8));
29804     // Copy the inverse of the parity flag into a register with setcc.
29805     SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
29806     // Extend to the original type.
29807     return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
29808   }
29809 
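  // Otherwise, fold the value down to a single byte by xoring the halves
  // together; xor preserves parity, so the PF flag of the final 8-bit xor
  // reflects the parity of the original value. For example,
  // parity(0xF00F000F) = parity(0xF00F ^ 0x000F) = parity(0xF0 ^ 0x00)
  // = parity(0xF0) = 0 (an even number of set bits).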
29810   if (VT == MVT::i64) {
    // Xor the high and low 32-bits together using a 32-bit operation.
29812     SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
29813                              DAG.getNode(ISD::SRL, DL, MVT::i64, X,
29814                                          DAG.getConstant(32, DL, MVT::i8)));
29815     SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
29816     X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
29817   }
29818 
29819   if (VT != MVT::i16) {
29820     // Xor the high and low 16-bits together using a 32-bit operation.
29821     SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
29822                                DAG.getConstant(16, DL, MVT::i8));
29823     X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
29824   } else {
29825     // If the input is 16-bits, we need to extend to use an i32 shift below.
29826     X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
29827   }
29828 
  // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
29830   // This should allow an h-reg to be used to save a shift.
29831   SDValue Hi = DAG.getNode(
29832       ISD::TRUNCATE, DL, MVT::i8,
29833       DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
29834   SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
29835   SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
29836   SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
29837 
29838   // Copy the inverse of the parity flag into a register with setcc.
29839   SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
29840   // Extend to the original type.
29841   return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
29842 }
29843 
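// Map an ATOMIC_LOAD_{ADD,SUB,OR,XOR,AND} node onto the corresponding
// LOCK-prefixed X86ISD node (LADD/LSUB/LOR/LXOR/LAND), which produces
// EFLAGS and a chain but no data result.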
29844 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
29845                                         const X86Subtarget &Subtarget) {
29846   unsigned NewOpc = 0;
29847   switch (N->getOpcode()) {
29848   case ISD::ATOMIC_LOAD_ADD:
29849     NewOpc = X86ISD::LADD;
29850     break;
29851   case ISD::ATOMIC_LOAD_SUB:
29852     NewOpc = X86ISD::LSUB;
29853     break;
29854   case ISD::ATOMIC_LOAD_OR:
29855     NewOpc = X86ISD::LOR;
29856     break;
29857   case ISD::ATOMIC_LOAD_XOR:
29858     NewOpc = X86ISD::LXOR;
29859     break;
29860   case ISD::ATOMIC_LOAD_AND:
29861     NewOpc = X86ISD::LAND;
29862     break;
29863   default:
29864     llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
29865   }
29866 
29867   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
29868 
29869   return DAG.getMemIntrinsicNode(
29870       NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
29871       {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
29872       /*MemVT=*/N->getSimpleValueType(0), MMO);
29873 }
29874 
29875 /// Lower atomic_load_ops into LOCK-prefixed operations.
29876 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
29877                                 const X86Subtarget &Subtarget) {
29878   AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
29879   SDValue Chain = N->getOperand(0);
29880   SDValue LHS = N->getOperand(1);
29881   SDValue RHS = N->getOperand(2);
29882   unsigned Opc = N->getOpcode();
29883   MVT VT = N->getSimpleValueType(0);
29884   SDLoc DL(N);
29885 
29886   // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
29887   // can only be lowered when the result is unused.  They should have already
29888   // been transformed into a cmpxchg loop in AtomicExpand.
29889   if (N->hasAnyUseOfValue(0)) {
29890     // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
29891     // select LXADD if LOCK_SUB can't be selected.
29892     if (Opc == ISD::ATOMIC_LOAD_SUB) {
29893       RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
29894       return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
29895                            RHS, AN->getMemOperand());
29896     }
29897     assert(Opc == ISD::ATOMIC_LOAD_ADD &&
29898            "Used AtomicRMW ops other than Add should have been expanded!");
29899     return N;
29900   }
29901 
  // Specialized lowering for the canonical form of an idempotent atomicrmw.
  // The core idea here is that since the memory location isn't actually
  // changing, all we need is a lowering for the *ordering* impacts of the
  // atomicrmw. As such, we can choose a different operation and memory
  // location to minimize impact on other code.
29907   if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
    // On X86, the only ordering which actually requires an instruction is
    // seq_cst on a non-SingleThread scope; everything else just needs to be
    // preserved during codegen and then dropped. Note that we expect (but
    // don't assume) that orderings other than seq_cst and acq_rel have been
    // canonicalized to a store or load.
29913     if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
29914         AN->getSyncScopeID() == SyncScope::System) {
29915       // Prefer a locked operation against a stack location to minimize cache
29916       // traffic.  This assumes that stack locations are very likely to be
29917       // accessed only by the owning thread.
29918       SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
29919       assert(!N->hasAnyUseOfValue(0));
29920       // NOTE: The getUNDEF is needed to give something for the unused result 0.
29921       return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29922                          DAG.getUNDEF(VT), NewChain);
29923     }
29924     // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29925     SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
29926     assert(!N->hasAnyUseOfValue(0));
29927     // NOTE: The getUNDEF is needed to give something for the unused result 0.
29928     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29929                        DAG.getUNDEF(VT), NewChain);
29930   }
29931 
29932   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
29933   // RAUW the chain, but don't worry about the result, as it's unused.
29934   assert(!N->hasAnyUseOfValue(0));
29935   // NOTE: The getUNDEF is needed to give something for the unused result 0.
29936   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29937                      DAG.getUNDEF(VT), LockOp.getValue(1));
29938 }
29939 
29940 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
29941                                  const X86Subtarget &Subtarget) {
29942   auto *Node = cast<AtomicSDNode>(Op.getNode());
29943   SDLoc dl(Node);
29944   EVT VT = Node->getMemoryVT();
29945 
29946   bool IsSeqCst =
29947       Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
29948   bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
29949 
29950   // If this store is not sequentially consistent and the type is legal
29951   // we can just keep it.
29952   if (!IsSeqCst && IsTypeLegal)
29953     return Op;
29954 
29955   if (VT == MVT::i64 && !IsTypeLegal) {
29956     // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
29957     // is enabled.
29958     bool NoImplicitFloatOps =
29959         DAG.getMachineFunction().getFunction().hasFnAttribute(
29960             Attribute::NoImplicitFloat);
29961     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
29962       SDValue Chain;
29963       if (Subtarget.hasSSE1()) {
29964         SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
29965                                        Node->getOperand(2));
29966         MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
29967         SclToVec = DAG.getBitcast(StVT, SclToVec);
29968         SDVTList Tys = DAG.getVTList(MVT::Other);
29969         SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
29970         Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
29971                                         MVT::i64, Node->getMemOperand());
29972       } else if (Subtarget.hasX87()) {
29973         // First load this into an 80-bit X87 register using a stack temporary.
29974         // This will put the whole integer into the significand.
29975         SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
29976         int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29977         MachinePointerInfo MPI =
29978             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29979         Chain =
29980             DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
29981                          MPI, MaybeAlign(), MachineMemOperand::MOStore);
29982         SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
29983         SDValue LdOps[] = {Chain, StackPtr};
29984         SDValue Value =
29985             DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
29986                                     /*Align*/ None, MachineMemOperand::MOLoad);
29987         Chain = Value.getValue(1);
29988 
29989         // Now use an FIST to do the atomic store.
29990         SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
29991         Chain =
29992             DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
29993                                     StoreOps, MVT::i64, Node->getMemOperand());
29994       }
29995 
29996       if (Chain) {
29997         // If this is a sequentially consistent store, also emit an appropriate
29998         // barrier.
29999         if (IsSeqCst)
30000           Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
30001 
30002         return Chain;
30003       }
30004     }
30005   }
30006 
30007   // Convert seq_cst store -> xchg
30008   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
30009   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
30010   SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
30011                                Node->getMemoryVT(),
30012                                Node->getOperand(0),
30013                                Node->getOperand(1), Node->getOperand(2),
30014                                Node->getMemOperand());
30015   return Swap.getValue(1);
30016 }
30017 
30018 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
30019   SDNode *N = Op.getNode();
30020   MVT VT = N->getSimpleValueType(0);
30021   unsigned Opc = Op.getOpcode();
30022 
30023   // Let legalize expand this if it isn't a legal type yet.
30024   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
30025     return SDValue();
30026 
30027   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
30028   SDLoc DL(N);
30029 
30030   // Set the carry flag.
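  // Adding all-ones to the incoming carry value produces a carry-out (sets
  // CF) exactly when that value is nonzero, materializing the boolean carry
  // in EFLAGS for the ADC/SBB below.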
30031   SDValue Carry = Op.getOperand(2);
30032   EVT CarryVT = Carry.getValueType();
30033   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
30034                       Carry, DAG.getAllOnesConstant(DL, CarryVT));
30035 
30036   bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
30037   SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
30038                             Op.getOperand(0), Op.getOperand(1),
30039                             Carry.getValue(1));
30040 
30041   bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
30042   SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
30043                            Sum.getValue(1), DL, DAG);
30044   if (N->getValueType(1) == MVT::i1)
30045     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
30046 
30047   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
30048 }
30049 
30050 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
30051                             SelectionDAG &DAG) {
30052   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
30053 
30054   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
30055   // which returns the values as { float, float } (in XMM0) or
30056   // { double, double } (which is returned in XMM0, XMM1).
30057   SDLoc dl(Op);
30058   SDValue Arg = Op.getOperand(0);
30059   EVT ArgVT = Arg.getValueType();
30060   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30061 
30062   TargetLowering::ArgListTy Args;
30063   TargetLowering::ArgListEntry Entry;
30064 
30065   Entry.Node = Arg;
30066   Entry.Ty = ArgTy;
30067   Entry.IsSExt = false;
30068   Entry.IsZExt = false;
30069   Args.push_back(Entry);
30070 
30071   bool isF64 = ArgVT == MVT::f64;
30072   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
30073   // the small struct {f32, f32} is returned in (eax, edx). For f64,
30074   // the results are returned via SRet in memory.
30075   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30076   RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
30077   const char *LibcallName = TLI.getLibcallName(LC);
30078   SDValue Callee =
30079       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
30080 
30081   Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
30082                       : (Type *)FixedVectorType::get(ArgTy, 4);
30083 
30084   TargetLowering::CallLoweringInfo CLI(DAG);
30085   CLI.setDebugLoc(dl)
30086       .setChain(DAG.getEntryNode())
30087       .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
30088 
30089   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
30090 
30091   if (isF64)
30092     // Returned in xmm0 and xmm1.
30093     return CallResult.first;
30094 
  // Returned in bits 0:31 and 32:63 of xmm0.
30096   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
30097                                CallResult.first, DAG.getIntPtrConstant(0, dl));
30098   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
30099                                CallResult.first, DAG.getIntPtrConstant(1, dl));
30100   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
30101   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
30102 }
30103 
30104 /// Widen a vector input to a vector of NVT.  The
30105 /// input vector must have the same element type as NVT.
30106 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
30107                             bool FillWithZeroes = false) {
30108   // Check if InOp already has the right width.
30109   MVT InVT = InOp.getSimpleValueType();
30110   if (InVT == NVT)
30111     return InOp;
30112 
30113   if (InOp.isUndef())
30114     return DAG.getUNDEF(NVT);
30115 
30116   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
30117          "input and widen element type must match");
30118 
30119   unsigned InNumElts = InVT.getVectorNumElements();
30120   unsigned WidenNumElts = NVT.getVectorNumElements();
30121   assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
30122          "Unexpected request for vector widening");
30123 
30124   SDLoc dl(InOp);
30125   if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
30126       InOp.getNumOperands() == 2) {
30127     SDValue N1 = InOp.getOperand(1);
30128     if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
30129         N1.isUndef()) {
30130       InOp = InOp.getOperand(0);
30131       InVT = InOp.getSimpleValueType();
30132       InNumElts = InVT.getVectorNumElements();
30133     }
30134   }
30135   if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
30136       ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
30137     SmallVector<SDValue, 16> Ops;
30138     for (unsigned i = 0; i < InNumElts; ++i)
30139       Ops.push_back(InOp.getOperand(i));
30140 
30141     EVT EltVT = InOp.getOperand(0).getValueType();
30142 
30143     SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
30144       DAG.getUNDEF(EltVT);
30145     for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
30146       Ops.push_back(FillVal);
30147     return DAG.getBuildVector(NVT, dl, Ops);
30148   }
30149   SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
30150     DAG.getUNDEF(NVT);
30151   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
30152                      InOp, DAG.getIntPtrConstant(0, dl));
30153 }
30154 
30155 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
30156                              SelectionDAG &DAG) {
30157   assert(Subtarget.hasAVX512() &&
30158          "MGATHER/MSCATTER are supported on AVX-512 arch only");
30159 
30160   MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
30161   SDValue Src = N->getValue();
30162   MVT VT = Src.getSimpleValueType();
30163   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
30164   SDLoc dl(Op);
30165 
30166   SDValue Scale = N->getScale();
30167   SDValue Index = N->getIndex();
30168   SDValue Mask = N->getMask();
30169   SDValue Chain = N->getChain();
30170   SDValue BasePtr = N->getBasePtr();
30171 
30172   if (VT == MVT::v2f32 || VT == MVT::v2i32) {
30173     assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
30174     // If the index is v2i64 and we have VLX we can use xmm for data and index.
30175     if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
30176       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30177       EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
30178       Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
30179       SDVTList VTs = DAG.getVTList(MVT::Other);
30180       SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30181       return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30182                                      N->getMemoryVT(), N->getMemOperand());
30183     }
30184     return SDValue();
30185   }
30186 
30187   MVT IndexVT = Index.getSimpleValueType();
30188 
30189   // If the index is v2i32, we're being called by type legalization and we
30190   // should just let the default handling take care of it.
30191   if (IndexVT == MVT::v2i32)
30192     return SDValue();
30193 
  // If we don't have VLX and neither the data nor the index is 512 bits, we
  // need to widen until one is.
30196   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
30197       !Index.getSimpleValueType().is512BitVector()) {
30198     // Determine how much we need to widen by to get a 512-bit type.
30199     unsigned Factor = std::min(512/VT.getSizeInBits(),
30200                                512/IndexVT.getSizeInBits());
30201     unsigned NumElts = VT.getVectorNumElements() * Factor;
30202 
30203     VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30204     IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30205     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30206 
30207     Src = ExtendToType(Src, VT, DAG);
30208     Index = ExtendToType(Index, IndexVT, DAG);
30209     Mask = ExtendToType(Mask, MaskVT, DAG, true);
30210   }
30211 
30212   SDVTList VTs = DAG.getVTList(MVT::Other);
30213   SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30214   return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30215                                  N->getMemoryVT(), N->getMemOperand());
30216 }
30217 
30218 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
30219                           SelectionDAG &DAG) {
30220 
30221   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
30222   MVT VT = Op.getSimpleValueType();
30223   MVT ScalarVT = VT.getScalarType();
30224   SDValue Mask = N->getMask();
30225   MVT MaskVT = Mask.getSimpleValueType();
30226   SDValue PassThru = N->getPassThru();
30227   SDLoc dl(Op);
30228 
30229   // Handle AVX masked loads which don't support passthru other than 0.
30230   if (MaskVT.getVectorElementType() != MVT::i1) {
30231     // We also allow undef in the isel pattern.
30232     if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
30233       return Op;
30234 
30235     SDValue NewLoad = DAG.getMaskedLoad(
30236         VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30237         getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
30238         N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
30239         N->isExpandingLoad());
30240     // Emit a blend.
30241     SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
30242     return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
30243   }
30244 
30245   assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
30246          "Expanding masked load is supported on AVX-512 target only!");
30247 
30248   assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
30249          "Expanding masked load is supported for 32 and 64-bit types only!");
30250 
30251   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30252          "Cannot lower masked load op.");
30253 
30254   assert((ScalarVT.getSizeInBits() >= 32 ||
30255           (Subtarget.hasBWI() &&
30256               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
30257          "Unsupported masked load op.");
30258 
  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bits.
30261   unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
30262   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30263   PassThru = ExtendToType(PassThru, WideDataVT, DAG);
30264 
30265   // Mask element has to be i1.
30266   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
30267          "Unexpected mask type");
30268 
30269   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30270 
30271   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30272   SDValue NewLoad = DAG.getMaskedLoad(
30273       WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30274       PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
30275       N->getExtensionType(), N->isExpandingLoad());
30276 
30277   SDValue Extract =
30278       DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
30279                   DAG.getIntPtrConstant(0, dl));
30280   SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
30281   return DAG.getMergeValues(RetOps, dl);
30282 }
30283 
30284 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
30285                            SelectionDAG &DAG) {
30286   MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
30287   SDValue DataToStore = N->getValue();
30288   MVT VT = DataToStore.getSimpleValueType();
30289   MVT ScalarVT = VT.getScalarType();
30290   SDValue Mask = N->getMask();
30291   SDLoc dl(Op);
30292 
  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
         "Compressing masked store supported on AVX-512 target only!");

  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
         "Compressing masked store supported for 32 and 64-bit types only!");
30298 
30299   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30300          "Cannot lower masked store op.");
30301 
30302   assert((ScalarVT.getSizeInBits() >= 32 ||
30303           (Subtarget.hasBWI() &&
30304               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
30305           "Unsupported masked store op.");
30306 
  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bits.
30309   unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
30310   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30311 
30312   // Mask element has to be i1.
30313   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
30314          "Unexpected mask type");
30315 
30316   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30317 
30318   DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
30319   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30320   return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
30321                             N->getOffset(), Mask, N->getMemoryVT(),
30322                             N->getMemOperand(), N->getAddressingMode(),
30323                             N->isTruncatingStore(), N->isCompressingStore());
30324 }
30325 
30326 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
30327                             SelectionDAG &DAG) {
30328   assert(Subtarget.hasAVX2() &&
30329          "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
30330 
30331   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
30332   SDLoc dl(Op);
30333   MVT VT = Op.getSimpleValueType();
30334   SDValue Index = N->getIndex();
30335   SDValue Mask = N->getMask();
30336   SDValue PassThru = N->getPassThru();
30337   MVT IndexVT = Index.getSimpleValueType();
30338 
30339   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
30340 
30341   // If the index is v2i32, we're being called by type legalization.
30342   if (IndexVT == MVT::v2i32)
30343     return SDValue();
30344 
  // If we don't have VLX and neither the passthru nor the index is 512 bits,
  // we need to widen until one is.
30347   MVT OrigVT = VT;
30348   if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30349       !IndexVT.is512BitVector()) {
30350     // Determine how much we need to widen by to get a 512-bit type.
30351     unsigned Factor = std::min(512/VT.getSizeInBits(),
30352                                512/IndexVT.getSizeInBits());
30353 
30354     unsigned NumElts = VT.getVectorNumElements() * Factor;
30355 
30356     VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30357     IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30358     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30359 
30360     PassThru = ExtendToType(PassThru, VT, DAG);
30361     Index = ExtendToType(Index, IndexVT, DAG);
30362     Mask = ExtendToType(Mask, MaskVT, DAG, true);
30363   }
30364 
30365   SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
30366                     N->getScale() };
30367   SDValue NewGather = DAG.getMemIntrinsicNode(
30368       X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
30369       N->getMemOperand());
30370   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
30371                                 NewGather, DAG.getIntPtrConstant(0, dl));
30372   return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
30373 }
30374 
30375 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
30376   SDLoc dl(Op);
30377   SDValue Src = Op.getOperand(0);
30378   MVT DstVT = Op.getSimpleValueType();
30379 
30380   AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
30381   unsigned SrcAS = N->getSrcAddressSpace();
30382 
30383   assert(SrcAS != N->getDestAddressSpace() &&
30384          "addrspacecast must be between different address spaces");
30385 
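  // Extending a 32-bit pointer to 64 bits zero-extends for the unsigned
  // ptr32_uptr address space and sign-extends otherwise; casts down to
  // 32 bits simply truncate.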
30386   if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
30387     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
30388   } else if (DstVT == MVT::i64) {
30389     Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
30390   } else if (DstVT == MVT::i32) {
30391     Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
30392   } else {
30393     report_fatal_error("Bad address space in addrspacecast");
30394   }
30395   return Op;
30396 }
30397 
30398 SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
30399                                               SelectionDAG &DAG) const {
30400   // TODO: Eventually, the lowering of these nodes should be informed by or
30401   // deferred to the GC strategy for the function in which they appear. For
30402   // now, however, they must be lowered to something. Since they are logically
30403   // no-ops in the case of a null GC strategy (or a GC strategy which does not
30404   // require special handling for these nodes), lower them as literal NOOPs for
30405   // the time being.
30406   SmallVector<SDValue, 2> Ops;
30407 
30408   Ops.push_back(Op.getOperand(0));
30409   if (Op->getGluedNode())
30410     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
30411 
30412   SDLoc OpDL(Op);
30413   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
30414   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
30415 
30416   return NOOP;
30417 }
30418 
30419 // Custom split CVTPS2PH with wide types.
30420 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
30421   SDLoc dl(Op);
30422   EVT VT = Op.getValueType();
30423   SDValue Lo, Hi;
30424   std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
30425   EVT LoVT, HiVT;
30426   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30427   SDValue RC = Op.getOperand(1);
30428   Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
30429   Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
30430   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30431 }
30432 
30433 /// Provide custom lowering hooks for some operations.
30434 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
30435   switch (Op.getOpcode()) {
30436   default: llvm_unreachable("Should not custom lower this!");
30437   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
30438   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
30439     return LowerCMP_SWAP(Op, Subtarget, DAG);
30440   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
30441   case ISD::ATOMIC_LOAD_ADD:
30442   case ISD::ATOMIC_LOAD_SUB:
30443   case ISD::ATOMIC_LOAD_OR:
30444   case ISD::ATOMIC_LOAD_XOR:
30445   case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
30446   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG, Subtarget);
30447   case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
30448   case ISD::PARITY:             return LowerPARITY(Op, Subtarget, DAG);
30449   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
30450   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
30451   case ISD::VECTOR_SHUFFLE:     return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
30452   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
30453   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
30454   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
30455   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
30456   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
30457   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
30458   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
30459   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
30460   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
30461   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
30462   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
30463   case ISD::SHL_PARTS:
30464   case ISD::SRA_PARTS:
30465   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
30466   case ISD::FSHL:
30467   case ISD::FSHR:               return LowerFunnelShift(Op, Subtarget, DAG);
30468   case ISD::STRICT_SINT_TO_FP:
30469   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
30470   case ISD::STRICT_UINT_TO_FP:
30471   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
30472   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
30473   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
30474   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
30475   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
30476   case ISD::ZERO_EXTEND_VECTOR_INREG:
30477   case ISD::SIGN_EXTEND_VECTOR_INREG:
30478     return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
30479   case ISD::FP_TO_SINT:
30480   case ISD::STRICT_FP_TO_SINT:
30481   case ISD::FP_TO_UINT:
30482   case ISD::STRICT_FP_TO_UINT:  return LowerFP_TO_INT(Op, DAG);
30483   case ISD::FP_TO_SINT_SAT:
30484   case ISD::FP_TO_UINT_SAT:     return LowerFP_TO_INT_SAT(Op, DAG);
30485   case ISD::FP_EXTEND:
30486   case ISD::STRICT_FP_EXTEND:   return LowerFP_EXTEND(Op, DAG);
30487   case ISD::FP_ROUND:
30488   case ISD::STRICT_FP_ROUND:    return LowerFP_ROUND(Op, DAG);
30489   case ISD::FP16_TO_FP:
30490   case ISD::STRICT_FP16_TO_FP:  return LowerFP16_TO_FP(Op, DAG);
30491   case ISD::FP_TO_FP16:
30492   case ISD::STRICT_FP_TO_FP16:  return LowerFP_TO_FP16(Op, DAG);
30493   case ISD::LOAD:               return LowerLoad(Op, Subtarget, DAG);
30494   case ISD::STORE:              return LowerStore(Op, Subtarget, DAG);
30495   case ISD::FADD:
30496   case ISD::FSUB:               return lowerFaddFsub(Op, DAG);
30497   case ISD::FROUND:             return LowerFROUND(Op, DAG);
30498   case ISD::FABS:
30499   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
30500   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
30501   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
30502   case ISD::LRINT:
30503   case ISD::LLRINT:             return LowerLRINT_LLRINT(Op, DAG);
30504   case ISD::SETCC:
30505   case ISD::STRICT_FSETCC:
30506   case ISD::STRICT_FSETCCS:     return LowerSETCC(Op, DAG);
30507   case ISD::SETCCCARRY:         return LowerSETCCCARRY(Op, DAG);
30508   case ISD::SELECT:             return LowerSELECT(Op, DAG);
30509   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
30510   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
30511   case ISD::VASTART:            return LowerVASTART(Op, DAG);
30512   case ISD::VAARG:              return LowerVAARG(Op, DAG);
30513   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
30514   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
30515   case ISD::INTRINSIC_VOID:
30516   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
30517   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
30518   case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);
30519   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
30520   case ISD::FRAME_TO_ARGS_OFFSET:
30521                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
30522   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
30523   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
30524   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
30525   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
30526   case ISD::EH_SJLJ_SETUP_DISPATCH:
30527     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
30528   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
30529   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
30530   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
30531   case ISD::SET_ROUNDING:       return LowerSET_ROUNDING(Op, DAG);
30532   case ISD::CTLZ:
30533   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
30534   case ISD::CTTZ:
30535   case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, Subtarget, DAG);
30536   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
30537   case ISD::MULHS:
30538   case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
30539   case ISD::ROTL:
30540   case ISD::ROTR:               return LowerRotate(Op, Subtarget, DAG);
30541   case ISD::SRA:
30542   case ISD::SRL:
30543   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
30544   case ISD::SADDO:
30545   case ISD::UADDO:
30546   case ISD::SSUBO:
30547   case ISD::USUBO:              return LowerXALUO(Op, DAG);
30548   case ISD::SMULO:
30549   case ISD::UMULO:              return LowerMULO(Op, Subtarget, DAG);
30550   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
30551   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
30552   case ISD::SADDO_CARRY:
30553   case ISD::SSUBO_CARRY:
30554   case ISD::ADDCARRY:
30555   case ISD::SUBCARRY:           return LowerADDSUBCARRY(Op, DAG);
30556   case ISD::ADD:
30557   case ISD::SUB:                return lowerAddSub(Op, DAG, Subtarget);
30558   case ISD::UADDSAT:
30559   case ISD::SADDSAT:
30560   case ISD::USUBSAT:
30561   case ISD::SSUBSAT:            return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
30562   case ISD::SMAX:
30563   case ISD::SMIN:
30564   case ISD::UMAX:
30565   case ISD::UMIN:               return LowerMINMAX(Op, DAG);
30566   case ISD::ABS:                return LowerABS(Op, Subtarget, DAG);
30567   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
30568   case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
30569   case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
30570   case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
30571   case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
30572   case ISD::GC_TRANSITION_START:
30573   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION(Op, DAG);
30574   case ISD::ADDRSPACECAST:      return LowerADDRSPACECAST(Op, DAG);
30575   case X86ISD::CVTPS2PH:        return LowerCVTPS2PH(Op, DAG);
30576   }
30577 }
30578 
30579 /// Replace a node with an illegal result type with a new node built out of
30580 /// custom code.
30581 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
30582                                            SmallVectorImpl<SDValue> &Results,
30583                                            SelectionDAG &DAG) const {
30584   SDLoc dl(N);
30585   switch (N->getOpcode()) {
30586   default:
30587 #ifndef NDEBUG
30588     dbgs() << "ReplaceNodeResults: ";
30589     N->dump(&DAG);
30590 #endif
30591     llvm_unreachable("Do not know how to custom type legalize this operation!");
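        // Split a wide CVTPH2PS into two half-width nodes and concatenate the
        // results, mirroring LowerCVTPS2PH above.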
30592   case X86ISD::CVTPH2PS: {
30593     EVT VT = N->getValueType(0);
30594     SDValue Lo, Hi;
30595     std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
30596     EVT LoVT, HiVT;
30597     std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30598     Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
30599     Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
30600     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30601     Results.push_back(Res);
30602     return;
30603   }
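        // The same split for the strict variant; the two output chains are
        // merged with a TokenFactor.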
30604   case X86ISD::STRICT_CVTPH2PS: {
30605     EVT VT = N->getValueType(0);
30606     SDValue Lo, Hi;
30607     std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
30608     EVT LoVT, HiVT;
30609     std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30610     Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
30611                      {N->getOperand(0), Lo});
30612     Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
30613                      {N->getOperand(0), Hi});
30614     SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
30615                                 Lo.getValue(1), Hi.getValue(1));
30616     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30617     Results.push_back(Res);
30618     Results.push_back(Chain);
30619     return;
30620   }
30621   case X86ISD::CVTPS2PH:
30622     Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
30623     return;
30624   case ISD::CTPOP: {
30625     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
30626     // Use a v2i64 if possible.
30627     bool NoImplicitFloatOps =
30628         DAG.getMachineFunction().getFunction().hasFnAttribute(
30629             Attribute::NoImplicitFloat);
30630     if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
30631       SDValue Wide =
30632           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
30633       Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
30634       // The bit count fits in 32 bits, so extract it as an i32 and zero
30635       // extend to i64. Otherwise we end up extracting bits 63:32 separately.
30636       Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
30637       Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
30638                          DAG.getIntPtrConstant(0, dl));
30639       Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
30640       Results.push_back(Wide);
30641     }
30642     return;
30643   }
30644   case ISD::MUL: {
30645     EVT VT = N->getValueType(0);
30646     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30647            VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
30648     // Pre-promote these to vXi16 to avoid op legalization thinking all 16
30649     // elements are needed.
30650     MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
30651     SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
30652     SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
30653     SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
30654     Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
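          // Widen the result back to a legal v16i8 by padding the upper
          // elements with undef.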
30655     unsigned NumConcats = 16 / VT.getVectorNumElements();
30656     SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
30657     ConcatOps[0] = Res;
30658     Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
30659     Results.push_back(Res);
30660     return;
30661   }
30662   case X86ISD::VPMADDWD:
30663   case X86ISD::AVG: {
30664     // Legalize types for X86ISD::AVG/VPMADDWD by widening.
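          // Pad the inputs with undef up to 128 bits and emit the wide node;
          // only the low elements of the result are actually used.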
30665     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30666 
30667     EVT VT = N->getValueType(0);
30668     EVT InVT = N->getOperand(0).getValueType();
30669     assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
30670            "Expected a VT that divides into 128 bits.");
30671     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30672            "Unexpected type action!");
30673     unsigned NumConcat = 128 / InVT.getSizeInBits();
30674 
30675     EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
30676                                     InVT.getVectorElementType(),
30677                                     NumConcat * InVT.getVectorNumElements());
30678     EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
30679                                   VT.getVectorElementType(),
30680                                   NumConcat * VT.getVectorNumElements());
30681 
30682     SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
30683     Ops[0] = N->getOperand(0);
30684     SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
30685     Ops[0] = N->getOperand(1);
30686     SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
30687 
30688     SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
30689     Results.push_back(Res);
30690     return;
30691   }
30692   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
30693   case X86ISD::FMINC:
30694   case X86ISD::FMIN:
30695   case X86ISD::FMAXC:
30696   case X86ISD::FMAX: {
30697     EVT VT = N->getValueType(0);
30698     assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
30699     SDValue UNDEF = DAG.getUNDEF(VT);
30700     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
30701                               N->getOperand(0), UNDEF);
30702     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
30703                               N->getOperand(1), UNDEF);
30704     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
30705     return;
30706   }
30707   case ISD::SDIV:
30708   case ISD::UDIV:
30709   case ISD::SREM:
30710   case ISD::UREM: {
30711     EVT VT = N->getValueType(0);
30712     if (VT.isVector()) {
30713       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30714              "Unexpected type action!");
30715       // If the RHS is a constant splat vector we can widen this and let the
30716       // division/remainder-by-constant optimization handle it.
30717       // TODO: Can we do something for non-splat?
30718       APInt SplatVal;
30719       if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
30720         unsigned NumConcats = 128 / VT.getSizeInBits();
30721         SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
30722         Ops0[0] = N->getOperand(0);
30723         EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
30724         SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
30725         SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
30726         SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
30727         Results.push_back(Res);
30728       }
30729       return;
30730     }
30731 
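          // Scalar case: an i128 division or remainder, which is expanded to
          // a libcall by LowerWin64_i128OP.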
30732     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
30733     Results.push_back(V);
30734     return;
30735   }
30736   case ISD::TRUNCATE: {
30737     MVT VT = N->getSimpleValueType(0);
30738     if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
30739       return;
30740 
30741     // The generic legalizer will try to widen the input type to the same
30742     // number of elements as the widened result type. But this isn't always
30743     // the best choice, so do some custom legalization to avoid some cases.
30744     MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
30745     SDValue In = N->getOperand(0);
30746     EVT InVT = In.getValueType();
30747 
30748     unsigned InBits = InVT.getSizeInBits();
30749     if (128 % InBits == 0) {
30750       // 128 bit and smaller inputs should avoid truncate altogether and
30751       // just use a build_vector that will become a shuffle.
30752       // TODO: Widen and use a shuffle directly?
30753       MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
30754       EVT EltVT = VT.getVectorElementType();
30755       unsigned WidenNumElts = WidenVT.getVectorNumElements();
30756       SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
30757       // Use the original element count so we don't do more scalar opts than
30758       // necessary.
30759       unsigned MinElts = VT.getVectorNumElements();
30760       for (unsigned i=0; i < MinElts; ++i) {
30761         SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
30762                                   DAG.getIntPtrConstant(i, dl));
30763         Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
30764       }
30765       Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
30766       return;
30767     }
30768     // With AVX512 there are some cases that can use a target specific
30769     // truncate node to go from 256/512 to less than 128 with zeros in the
30770     // upper elements of the 128 bit result.
30771     if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
30772       // We can use VTRUNC directly for 256 bits with VLX or for any 512 bits.
30773       if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
30774         Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
30775         return;
30776       }
30777       // There's one case we can widen to 512 bits and use VTRUNC.
30778       if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
30779         In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
30780                          DAG.getUNDEF(MVT::v4i64));
30781         Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
30782         return;
30783       }
30784     }
30785     if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
30786         getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
30787         isTypeLegal(MVT::v4i64)) {
30788       // The input needs to be split and the output needs to be widened. Use
30789       // two VTRUNCs, and shuffle their results together into the wider type.
30790       SDValue Lo, Hi;
30791       std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
30792 
30793       Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
30794       Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
30795       SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
30796                                          { 0,  1,  2,  3, 16, 17, 18, 19,
30797                                           -1, -1, -1, -1, -1, -1, -1, -1 });
30798       Results.push_back(Res);
30799       return;
30800     }
30801 
30802     return;
30803   }
30804   case ISD::ANY_EXTEND:
30805     // Right now, only MVT::v8i8 has Custom action for an illegal type.
30806     // The Custom action is intended for handling the input type.
30807     assert(N->getValueType(0) == MVT::v8i8 &&
30808            "Do not know how to legalize this Node");
30809     return;
30810   case ISD::SIGN_EXTEND:
30811   case ISD::ZERO_EXTEND: {
30812     EVT VT = N->getValueType(0);
30813     SDValue In = N->getOperand(0);
30814     EVT InVT = In.getValueType();
30815     if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
30816         (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
30817       assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
30818              "Unexpected type action!");
30819       assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
30820       // Custom split this so we can extend i8/i16->i32 in a vector. This is
30821       // better since sign_extend_inreg i8/i16->i64 requires an extend to i32
30822       // using sra, then an extend from i32 to i64 using pcmpgt. Splitting
30823       // lets both halves share the sra from the extend to i32.
30824       In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
30825 
30826       // Fill a vector with sign bits for each element.
30827       SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
30828       SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
30829 
30830       // Create an unpackl and unpackh to interleave the sign bits then bitcast
30831       // to v2i64.
30832       SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
30833                                         {0, 4, 1, 5});
30834       Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
30835       SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
30836                                         {2, 6, 3, 7});
30837       Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
30838 
30839       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30840       Results.push_back(Res);
30841       return;
30842     }
30843 
30844     if (VT == MVT::v16i32 || VT == MVT::v8i64) {
30845       if (!InVT.is128BitVector()) {
30846         // Not a 128 bit vector, but maybe type legalization will promote
30847         // it to 128 bits.
30848         if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
30849           return;
30850         InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
30851         if (!InVT.is128BitVector())
30852           return;
30853 
30854         // Promote the input to 128 bits. Type legalization will turn this into
30855         // zext_inreg/sext_inreg.
30856         In = DAG.getNode(N->getOpcode(), dl, InVT, In);
30857       }
30858 
30859       // Perform custom splitting instead of the two stage extend we would get
30860       // by default.
30861       EVT LoVT, HiVT;
30862       std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
30863       assert(isTypeLegal(LoVT) && "Split VT not legal?");
30864 
30865       SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
30866 
30867       // We need to shift the input over by half the number of elements.
30868       unsigned NumElts = InVT.getVectorNumElements();
30869       unsigned HalfNumElts = NumElts / 2;
30870       SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
30871       for (unsigned i = 0; i != HalfNumElts; ++i)
30872         ShufMask[i] = i + HalfNumElts;
30873 
30874       SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
30875       Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
30876 
30877       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30878       Results.push_back(Res);
30879     }
30880     return;
30881   }
30882   case ISD::FP_TO_SINT:
30883   case ISD::STRICT_FP_TO_SINT:
30884   case ISD::FP_TO_UINT:
30885   case ISD::STRICT_FP_TO_UINT: {
30886     bool IsStrict = N->isStrictFPOpcode();
30887     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
30888                     N->getOpcode() == ISD::STRICT_FP_TO_SINT;
30889     EVT VT = N->getValueType(0);
30890     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
30891     EVT SrcVT = Src.getValueType();
30892 
30893     if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
30894       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30895              "Unexpected type action!");
30896 
30897       // Try to create a 128 bit vector, but don't exceed a 32 bit element.
30898       unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
30899       MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
30900                                        VT.getVectorNumElements());
30901       SDValue Res;
30902       SDValue Chain;
30903       if (IsStrict) {
30904         Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
30905                           {N->getOperand(0), Src});
30906         Chain = Res.getValue(1);
30907       } else
30908         Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
30909 
30910       // Preserve what we know about the size of the original result. If the
30911       // result is v2i32, we have to manually widen the assert.
30912       if (PromoteVT == MVT::v2i32)
30913         Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
30914                           DAG.getUNDEF(MVT::v2i32));
30915 
30916       Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
30917                         Res.getValueType(), Res,
30918                         DAG.getValueType(VT.getVectorElementType()));
30919 
30920       if (PromoteVT == MVT::v2i32)
30921         Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
30922                           DAG.getIntPtrConstant(0, dl));
30923 
30924       // Truncate back to the original width.
30925       Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
30926 
30927       // Now widen to 128 bits.
30928       unsigned NumConcats = 128 / VT.getSizeInBits();
30929       MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
30930                                       VT.getVectorNumElements() * NumConcats);
30931       SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
30932       ConcatOps[0] = Res;
30933       Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
30934       Results.push_back(Res);
30935       if (IsStrict)
30936         Results.push_back(Chain);
30937       return;
30938     }
30939 
30940 
30941     if (VT == MVT::v2i32) {
30942       assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
30943              "Strict unsigned conversion requires AVX512");
30944       assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30945       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30946              "Unexpected type action!");
30947       if (Src.getValueType() == MVT::v2f64) {
30948         if (!IsSigned && !Subtarget.hasAVX512()) {
30949           SDValue Res =
30950               expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
30951           Results.push_back(Res);
30952           return;
30953         }
30954 
30955         unsigned Opc;
30956         if (IsStrict)
30957           Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
30958         else
30959           Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
30960 
30961         // If we have VLX we can emit a target specific FP_TO_UINT node.
30962         if (!IsSigned && !Subtarget.hasVLX()) {
30963           // Otherwise we can defer to the generic legalizer which will widen
30964           // the input as well. This will be further widened during op
30965           // legalization to v8i32<-v8f64.
30966           // For strict nodes we'll need to widen ourselves.
30967           // FIXME: Fix the type legalizer to safely widen strict nodes?
30968           if (!IsStrict)
30969             return;
30970           Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
30971                             DAG.getConstantFP(0.0, dl, MVT::v2f64));
30972           Opc = N->getOpcode();
30973         }
30974         SDValue Res;
30975         SDValue Chain;
30976         if (IsStrict) {
30977           Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
30978                             {N->getOperand(0), Src});
30979           Chain = Res.getValue(1);
30980         } else {
30981           Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
30982         }
30983         Results.push_back(Res);
30984         if (IsStrict)
30985           Results.push_back(Chain);
30986         return;
30987       }
30988 
30989       // Custom widen strict v2f32->v2i32 by padding with zeros.
30990       // FIXME: Should generic type legalizer do this?
30991       if (Src.getValueType() == MVT::v2f32 && IsStrict) {
30992         Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
30993                           DAG.getConstantFP(0.0, dl, MVT::v2f32));
30994         SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
30995                                   {N->getOperand(0), Src});
30996         Results.push_back(Res);
30997         Results.push_back(Res.getValue(1));
30998         return;
30999       }
31000 
31001       // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
31002       // so early out here.
31003       return;
31004     }
31005 
31006     assert(!VT.isVector() && "Vectors should have been handled above!");
31007 
31008     if (Subtarget.hasDQI() && VT == MVT::i64 &&
31009         (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
31010       assert(!Subtarget.is64Bit() && "i64 should be legal");
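            // With VLX a 128-bit vector is enough; without it the conversion
            // has to be done in a 512-bit vector.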
31011       unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
31012       // If we use a 128-bit result we might need to use a target specific node.
31013       unsigned SrcElts =
31014           std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
31015       MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
31016       MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
31017       unsigned Opc = N->getOpcode();
31018       if (NumElts != SrcElts) {
31019         if (IsStrict)
31020           Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
31021         else
31022           Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
31023       }
31024 
31025       SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
31026       SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
31027                                 DAG.getConstantFP(0.0, dl, VecInVT), Src,
31028                                 ZeroIdx);
31029       SDValue Chain;
31030       if (IsStrict) {
31031         SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
31032         Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
31033         Chain = Res.getValue(1);
31034       } else
31035         Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
31036       Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
31037       Results.push_back(Res);
31038       if (IsStrict)
31039         Results.push_back(Chain);
31040       return;
31041     }
31042 
31043     SDValue Chain;
31044     if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
31045       Results.push_back(V);
31046       if (IsStrict)
31047         Results.push_back(Chain);
31048     }
31049     return;
31050   }
31051   case ISD::LRINT:
31052   case ISD::LLRINT: {
31053     if (SDValue V = LRINT_LLRINTHelper(N, DAG))
31054       Results.push_back(V);
31055     return;
31056   }
31057 
31058   case ISD::SINT_TO_FP:
31059   case ISD::STRICT_SINT_TO_FP:
31060   case ISD::UINT_TO_FP:
31061   case ISD::STRICT_UINT_TO_FP: {
31062     bool IsStrict = N->isStrictFPOpcode();
31063     bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
31064                     N->getOpcode() == ISD::STRICT_SINT_TO_FP;
31065     EVT VT = N->getValueType(0);
31066     if (VT != MVT::v2f32)
31067       return;
31068     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
31069     EVT SrcVT = Src.getValueType();
31070     if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
31071       if (IsStrict) {
31072         unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
31073                                 : X86ISD::STRICT_CVTUI2P;
31074         SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
31075                                   {N->getOperand(0), Src});
31076         Results.push_back(Res);
31077         Results.push_back(Res.getValue(1));
31078       } else {
31079         unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
31080         Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
31081       }
31082       return;
31083     }
31084     if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
31085         Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
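            // There is no native unsigned v2i64->f32 conversion here, so use
            // the standard trick: for inputs with the sign bit set, halve the
            // value with round-to-odd ((x >> 1) | (x & 1)), convert that as
            // signed and double the result with an fadd, then select between
            // the direct signed conversion and this slow path per element.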
31086       SDValue Zero = DAG.getConstant(0, dl, SrcVT);
31087       SDValue One  = DAG.getConstant(1, dl, SrcVT);
31088       SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
31089                                  DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
31090                                  DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
31091       SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
31092       SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
31093       SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
31094       for (int i = 0; i != 2; ++i) {
31095         SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
31096                                   SignSrc, DAG.getIntPtrConstant(i, dl));
31097         if (IsStrict)
31098           SignCvts[i] =
31099               DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
31100                           {N->getOperand(0), Elt});
31101         else
31102           SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
31103       }
31104       SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
31105       SDValue Slow, Chain;
31106       if (IsStrict) {
31107         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31108                             SignCvts[0].getValue(1), SignCvts[1].getValue(1));
31109         Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
31110                            {Chain, SignCvt, SignCvt});
31111         Chain = Slow.getValue(1);
31112       } else {
31113         Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
31114       }
31115       IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
31116       IsNeg =
31117           DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
31118       SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
31119       Results.push_back(Cvt);
31120       if (IsStrict)
31121         Results.push_back(Chain);
31122       return;
31123     }
31124 
31125     if (SrcVT != MVT::v2i32)
31126       return;
31127 
31128     if (IsSigned || Subtarget.hasAVX512()) {
31129       if (!IsStrict)
31130         return;
31131 
31132       // Custom widen strict v2i32->v2f32 to avoid scalarization.
31133       // FIXME: Should generic type legalizer do this?
31134       Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
31135                         DAG.getConstant(0, dl, MVT::v2i32));
31136       SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
31137                                 {N->getOperand(0), Src});
31138       Results.push_back(Res);
31139       Results.push_back(Res.getValue(1));
31140       return;
31141     }
31142 
31143     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
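          // Use the classic 2^52 trick for unsigned v2i32->v2f32: zero-extend
          // to i64, OR the bits into the mantissa of the f64 constant 2^52
          // (0x4330000000000000), subtract 2^52 to get the exact f64 value,
          // and narrow it to f32 with VFPROUND.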
31144     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
31145     SDValue VBias =
31146         DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
31147     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
31148                              DAG.getBitcast(MVT::v2i64, VBias));
31149     Or = DAG.getBitcast(MVT::v2f64, Or);
31150     if (IsStrict) {
31151       SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
31152                                 {N->getOperand(0), Or, VBias});
31153       SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
31154                                 {MVT::v4f32, MVT::Other},
31155                                 {Sub.getValue(1), Sub});
31156       Results.push_back(Res);
31157       Results.push_back(Res.getValue(1));
31158     } else {
31159       // TODO: Are there any fast-math-flags to propagate here?
31160       SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
31161       Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
31162     }
31163     return;
31164   }
31165   case ISD::STRICT_FP_ROUND:
31166   case ISD::FP_ROUND: {
31167     bool IsStrict = N->isStrictFPOpcode();
31168     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
31169     if (!isTypeLegal(Src.getValueType()))
31170       return;
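          // The v2f32 result is widened to v4f32, so emit the target-specific
          // VFPROUND node directly (strict or non-strict).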
31171     SDValue V;
31172     if (IsStrict)
31173       V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
31174                       {N->getOperand(0), N->getOperand(1)});
31175     else
31176       V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
31177     Results.push_back(V);
31178     if (IsStrict)
31179       Results.push_back(V.getValue(1));
31180     return;
31181   }
31182   case ISD::FP_EXTEND:
31183   case ISD::STRICT_FP_EXTEND: {
31184     // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
31185     // No other ValueType for FP_EXTEND should reach this point.
31186     assert(N->getValueType(0) == MVT::v2f32 &&
31187            "Do not know how to legalize this Node");
31188     return;
31189   }
31190   case ISD::INTRINSIC_W_CHAIN: {
31191     unsigned IntNo = N->getConstantOperandVal(1);
31192     switch (IntNo) {
31193     default : llvm_unreachable("Do not know how to custom type "
31194                                "legalize this intrinsic operation!");
31195     case Intrinsic::x86_rdtsc:
31196       return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
31197                                      Results);
31198     case Intrinsic::x86_rdtscp:
31199       return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
31200                                      Results);
31201     case Intrinsic::x86_rdpmc:
31202       expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
31203                                   Results);
31204       return;
31205     case Intrinsic::x86_xgetbv:
31206       expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
31207                                   Results);
31208       return;
31209     }
31210   }
31211   case ISD::READCYCLECOUNTER: {
31212     return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
31213   }
31214   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
31215     EVT T = N->getValueType(0);
31216     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
31217     bool Regs64bit = T == MVT::i128;
31218     assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
31219            "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
31220     MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
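          // CMPXCHG8B/CMPXCHG16B compare against EDX:EAX (RDX:RAX) and, on
          // success, store ECX:EBX (RCX:RBX), so marshal the halves of the
          // expected and new values into those registers.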
31221     SDValue cpInL, cpInH;
31222     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
31223                         DAG.getConstant(0, dl, HalfT));
31224     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
31225                         DAG.getConstant(1, dl, HalfT));
31226     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
31227                              Regs64bit ? X86::RAX : X86::EAX,
31228                              cpInL, SDValue());
31229     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
31230                              Regs64bit ? X86::RDX : X86::EDX,
31231                              cpInH, cpInL.getValue(1));
31232     SDValue swapInL, swapInH;
31233     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
31234                           DAG.getConstant(0, dl, HalfT));
31235     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
31236                           DAG.getConstant(1, dl, HalfT));
31237     swapInH =
31238         DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
31239                          swapInH, cpInH.getValue(1));
31240 
31241     // In 64-bit mode we might need the base pointer in RBX, but we can't know
31242     // until later. So we keep the RBX input in a vreg and use a custom
31243     // inserter.
31244     // Since RBX will be a reserved register, the register allocator will not
31245     // ensure that its value is properly saved and restored around this
31246     // live range.
31247     SDValue Result;
31248     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
31249     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
31250     if (Regs64bit) {
31251       SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
31252                        swapInH.getValue(1)};
31253       Result =
31254           DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
31255     } else {
31256       swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
31257                                  swapInH.getValue(1));
31258       SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
31259                        swapInL.getValue(1)};
31260       Result =
31261           DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
31262     }
31263 
31264     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
31265                                         Regs64bit ? X86::RAX : X86::EAX,
31266                                         HalfT, Result.getValue(1));
31267     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
31268                                         Regs64bit ? X86::RDX : X86::EDX,
31269                                         HalfT, cpOutL.getValue(2));
31270     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
31271 
31272     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
31273                                         MVT::i32, cpOutH.getValue(2));
31274     SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
31275     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
31276 
31277     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
31278     Results.push_back(Success);
31279     Results.push_back(EFLAGS.getValue(1));
31280     return;
31281   }
31282   case ISD::ATOMIC_LOAD: {
31283     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31284     bool NoImplicitFloatOps =
31285         DAG.getMachineFunction().getFunction().hasFnAttribute(
31286             Attribute::NoImplicitFloat);
31287     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31288       auto *Node = cast<AtomicSDNode>(N);
31289       if (Subtarget.hasSSE1()) {
31290         // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
31291         // Then extract the lower 64 bits.
31292         MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31293         SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
31294         SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
31295         SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
31296                                              MVT::i64, Node->getMemOperand());
31297         if (Subtarget.hasSSE2()) {
31298           SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
31299                                     DAG.getIntPtrConstant(0, dl));
31300           Results.push_back(Res);
31301           Results.push_back(Ld.getValue(1));
31302           return;
31303         }
31304         // We use an alternative sequence for SSE1 that extracts as v2f32 and
31305         // then casts to i64. This avoids a 128-bit stack temporary being
31306         // created by type legalization if we were to cast v4f32->v2i64.
31307         SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
31308                                   DAG.getIntPtrConstant(0, dl));
31309         Res = DAG.getBitcast(MVT::i64, Res);
31310         Results.push_back(Res);
31311         Results.push_back(Ld.getValue(1));
31312         return;
31313       }
31314       if (Subtarget.hasX87()) {
31315         // First load this into an 80-bit X87 register. This will put the whole
31316         // integer into the significand.
31317         SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31318         SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
31319         SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
31320                                                  dl, Tys, Ops, MVT::i64,
31321                                                  Node->getMemOperand());
31322         SDValue Chain = Result.getValue(1);
31323 
31324         // Now store the X87 register to a stack temporary and convert to i64.
31325         // This store is not atomic and doesn't need to be.
31326         // FIXME: We don't need a stack temporary if the result of the load
31327         // is already being stored. We could just directly store there.
31328         SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31329         int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31330         MachinePointerInfo MPI =
31331             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31332         SDValue StoreOps[] = { Chain, Result, StackPtr };
31333         Chain = DAG.getMemIntrinsicNode(
31334             X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
31335             MPI, None /*Align*/, MachineMemOperand::MOStore);
31336 
31337         // Finally load the value back from the stack temporary and return it.
31338         // This load is not atomic and doesn't need to be.
31339         // This load will be further type legalized.
31340         Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
31341         Results.push_back(Result);
31342         Results.push_back(Result.getValue(1));
31343         return;
31344       }
31345     }
31346     // TODO: Use MOVLPS when SSE1 is available?
31347     // Delegate to generic TypeLegalization. Situations we can really handle
31348     // should have already been dealt with by AtomicExpandPass.cpp.
31349     break;
31350   }
31351   case ISD::ATOMIC_SWAP:
31352   case ISD::ATOMIC_LOAD_ADD:
31353   case ISD::ATOMIC_LOAD_SUB:
31354   case ISD::ATOMIC_LOAD_AND:
31355   case ISD::ATOMIC_LOAD_OR:
31356   case ISD::ATOMIC_LOAD_XOR:
31357   case ISD::ATOMIC_LOAD_NAND:
31358   case ISD::ATOMIC_LOAD_MIN:
31359   case ISD::ATOMIC_LOAD_MAX:
31360   case ISD::ATOMIC_LOAD_UMIN:
31361   case ISD::ATOMIC_LOAD_UMAX:
31362     // Delegate to generic TypeLegalization. Situations we can really handle
31363     // should have already been dealt with by AtomicExpandPass.cpp.
31364     break;
31365 
31366   case ISD::BITCAST: {
31367     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31368     EVT DstVT = N->getValueType(0);
31369     EVT SrcVT = N->getOperand(0).getValueType();
31370 
31371     // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit
31372     // target we can split using the k-register rather than memory.
31373     if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
31374       assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31375       SDValue Lo, Hi;
31376       std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
31377       Lo = DAG.getBitcast(MVT::i32, Lo);
31378       Hi = DAG.getBitcast(MVT::i32, Hi);
31379       SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
31380       Results.push_back(Res);
31381       return;
31382     }
31383 
31384     if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
31385       // FIXME: Use v4f32 for SSE1?
31386       assert(Subtarget.hasSSE2() && "Requires SSE2");
31387       assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
31388              "Unexpected type action!");
31389       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
31390       SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
31391                                 N->getOperand(0));
31392       Res = DAG.getBitcast(WideVT, Res);
31393       Results.push_back(Res);
31394       return;
31395     }
31396 
31397     return;
31398   }
31399   case ISD::MGATHER: {
31400     EVT VT = N->getValueType(0);
31401     if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
31402         (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
31403       auto *Gather = cast<MaskedGatherSDNode>(N);
31404       SDValue Index = Gather->getIndex();
31405       if (Index.getValueType() != MVT::v2i64)
31406         return;
31407       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31408              "Unexpected type action!");
31409       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
31410       SDValue Mask = Gather->getMask();
31411       assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31412       SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
31413                                      Gather->getPassThru(),
31414                                      DAG.getUNDEF(VT));
31415       if (!Subtarget.hasVLX()) {
31416         // We need to widen the mask, but the instruction will only use 2
31417         // of its elements. So we can use undef.
31418         Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
31419                            DAG.getUNDEF(MVT::v2i1));
31420         Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
31421       }
31422       SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
31423                         Gather->getBasePtr(), Index, Gather->getScale() };
31424       SDValue Res = DAG.getMemIntrinsicNode(
31425           X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
31426           Gather->getMemoryVT(), Gather->getMemOperand());
31427       Results.push_back(Res);
31428       Results.push_back(Res.getValue(1));
31429       return;
31430     }
31431     return;
31432   }
31433   case ISD::LOAD: {
31434     // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
31435     // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an
31436     // int->fp cast since type legalization will try to use an i64 load.
31437     MVT VT = N->getSimpleValueType(0);
31438     assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
31439     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31440            "Unexpected type action!");
31441     if (!ISD::isNON_EXTLoad(N))
31442       return;
31443     auto *Ld = cast<LoadSDNode>(N);
31444     if (Subtarget.hasSSE2()) {
31445       MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
31446       SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
31447                                 Ld->getPointerInfo(), Ld->getOriginalAlign(),
31448                                 Ld->getMemOperand()->getFlags());
31449       SDValue Chain = Res.getValue(1);
31450       MVT VecVT = MVT::getVectorVT(LdVT, 2);
31451       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
31452       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
31453       Res = DAG.getBitcast(WideVT, Res);
31454       Results.push_back(Res);
31455       Results.push_back(Chain);
31456       return;
31457     }
31458     assert(Subtarget.hasSSE1() && "Expected SSE");
31459     SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
31460     SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
31461     SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
31462                                           MVT::i64, Ld->getMemOperand());
31463     Results.push_back(Res);
31464     Results.push_back(Res.getValue(1));
31465     return;
31466   }
31467   case ISD::ADDRSPACECAST: {
31468     SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
31469     Results.push_back(V);
31470     return;
31471   }
31472   case ISD::BITREVERSE:
31473     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31474     assert(Subtarget.hasXOP() && "Expected XOP");
31475     // We can use VPPERM by copying to a vector register and back. We'll need
31476     // to move the scalar in two i32 pieces.
31477     Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
31478     return;
31479   }
31480 }
31481 
31482 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
31483   switch ((X86ISD::NodeType)Opcode) {
31484   case X86ISD::FIRST_NUMBER:       break;
31485 #define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
31486   NODE_NAME_CASE(BSF)
31487   NODE_NAME_CASE(BSR)
31488   NODE_NAME_CASE(FSHL)
31489   NODE_NAME_CASE(FSHR)
31490   NODE_NAME_CASE(FAND)
31491   NODE_NAME_CASE(FANDN)
31492   NODE_NAME_CASE(FOR)
31493   NODE_NAME_CASE(FXOR)
31494   NODE_NAME_CASE(FILD)
31495   NODE_NAME_CASE(FIST)
31496   NODE_NAME_CASE(FP_TO_INT_IN_MEM)
31497   NODE_NAME_CASE(FLD)
31498   NODE_NAME_CASE(FST)
31499   NODE_NAME_CASE(CALL)
31500   NODE_NAME_CASE(CALL_RVMARKER)
31501   NODE_NAME_CASE(BT)
31502   NODE_NAME_CASE(CMP)
31503   NODE_NAME_CASE(FCMP)
31504   NODE_NAME_CASE(STRICT_FCMP)
31505   NODE_NAME_CASE(STRICT_FCMPS)
31506   NODE_NAME_CASE(COMI)
31507   NODE_NAME_CASE(UCOMI)
31508   NODE_NAME_CASE(CMPM)
31509   NODE_NAME_CASE(CMPMM)
31510   NODE_NAME_CASE(STRICT_CMPM)
31511   NODE_NAME_CASE(CMPMM_SAE)
31512   NODE_NAME_CASE(SETCC)
31513   NODE_NAME_CASE(SETCC_CARRY)
31514   NODE_NAME_CASE(FSETCC)
31515   NODE_NAME_CASE(FSETCCM)
31516   NODE_NAME_CASE(FSETCCM_SAE)
31517   NODE_NAME_CASE(CMOV)
31518   NODE_NAME_CASE(BRCOND)
31519   NODE_NAME_CASE(RET_FLAG)
31520   NODE_NAME_CASE(IRET)
31521   NODE_NAME_CASE(REP_STOS)
31522   NODE_NAME_CASE(REP_MOVS)
31523   NODE_NAME_CASE(GlobalBaseReg)
31524   NODE_NAME_CASE(Wrapper)
31525   NODE_NAME_CASE(WrapperRIP)
31526   NODE_NAME_CASE(MOVQ2DQ)
31527   NODE_NAME_CASE(MOVDQ2Q)
31528   NODE_NAME_CASE(MMX_MOVD2W)
31529   NODE_NAME_CASE(MMX_MOVW2D)
31530   NODE_NAME_CASE(PEXTRB)
31531   NODE_NAME_CASE(PEXTRW)
31532   NODE_NAME_CASE(INSERTPS)
31533   NODE_NAME_CASE(PINSRB)
31534   NODE_NAME_CASE(PINSRW)
31535   NODE_NAME_CASE(PSHUFB)
31536   NODE_NAME_CASE(ANDNP)
31537   NODE_NAME_CASE(BLENDI)
31538   NODE_NAME_CASE(BLENDV)
31539   NODE_NAME_CASE(HADD)
31540   NODE_NAME_CASE(HSUB)
31541   NODE_NAME_CASE(FHADD)
31542   NODE_NAME_CASE(FHSUB)
31543   NODE_NAME_CASE(CONFLICT)
31544   NODE_NAME_CASE(FMAX)
31545   NODE_NAME_CASE(FMAXS)
31546   NODE_NAME_CASE(FMAX_SAE)
31547   NODE_NAME_CASE(FMAXS_SAE)
31548   NODE_NAME_CASE(FMIN)
31549   NODE_NAME_CASE(FMINS)
31550   NODE_NAME_CASE(FMIN_SAE)
31551   NODE_NAME_CASE(FMINS_SAE)
31552   NODE_NAME_CASE(FMAXC)
31553   NODE_NAME_CASE(FMINC)
31554   NODE_NAME_CASE(FRSQRT)
31555   NODE_NAME_CASE(FRCP)
31556   NODE_NAME_CASE(EXTRQI)
31557   NODE_NAME_CASE(INSERTQI)
31558   NODE_NAME_CASE(TLSADDR)
31559   NODE_NAME_CASE(TLSBASEADDR)
31560   NODE_NAME_CASE(TLSCALL)
31561   NODE_NAME_CASE(EH_SJLJ_SETJMP)
31562   NODE_NAME_CASE(EH_SJLJ_LONGJMP)
31563   NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
31564   NODE_NAME_CASE(EH_RETURN)
31565   NODE_NAME_CASE(TC_RETURN)
31566   NODE_NAME_CASE(FNSTCW16m)
31567   NODE_NAME_CASE(FLDCW16m)
31568   NODE_NAME_CASE(LCMPXCHG_DAG)
31569   NODE_NAME_CASE(LCMPXCHG8_DAG)
31570   NODE_NAME_CASE(LCMPXCHG16_DAG)
31571   NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
31572   NODE_NAME_CASE(LADD)
31573   NODE_NAME_CASE(LSUB)
31574   NODE_NAME_CASE(LOR)
31575   NODE_NAME_CASE(LXOR)
31576   NODE_NAME_CASE(LAND)
31577   NODE_NAME_CASE(VZEXT_MOVL)
31578   NODE_NAME_CASE(VZEXT_LOAD)
31579   NODE_NAME_CASE(VEXTRACT_STORE)
31580   NODE_NAME_CASE(VTRUNC)
31581   NODE_NAME_CASE(VTRUNCS)
31582   NODE_NAME_CASE(VTRUNCUS)
31583   NODE_NAME_CASE(VMTRUNC)
31584   NODE_NAME_CASE(VMTRUNCS)
31585   NODE_NAME_CASE(VMTRUNCUS)
31586   NODE_NAME_CASE(VTRUNCSTORES)
31587   NODE_NAME_CASE(VTRUNCSTOREUS)
31588   NODE_NAME_CASE(VMTRUNCSTORES)
31589   NODE_NAME_CASE(VMTRUNCSTOREUS)
31590   NODE_NAME_CASE(VFPEXT)
31591   NODE_NAME_CASE(STRICT_VFPEXT)
31592   NODE_NAME_CASE(VFPEXT_SAE)
31593   NODE_NAME_CASE(VFPEXTS)
31594   NODE_NAME_CASE(VFPEXTS_SAE)
31595   NODE_NAME_CASE(VFPROUND)
31596   NODE_NAME_CASE(STRICT_VFPROUND)
31597   NODE_NAME_CASE(VMFPROUND)
31598   NODE_NAME_CASE(VFPROUND_RND)
31599   NODE_NAME_CASE(VFPROUNDS)
31600   NODE_NAME_CASE(VFPROUNDS_RND)
31601   NODE_NAME_CASE(VSHLDQ)
31602   NODE_NAME_CASE(VSRLDQ)
31603   NODE_NAME_CASE(VSHL)
31604   NODE_NAME_CASE(VSRL)
31605   NODE_NAME_CASE(VSRA)
31606   NODE_NAME_CASE(VSHLI)
31607   NODE_NAME_CASE(VSRLI)
31608   NODE_NAME_CASE(VSRAI)
31609   NODE_NAME_CASE(VSHLV)
31610   NODE_NAME_CASE(VSRLV)
31611   NODE_NAME_CASE(VSRAV)
31612   NODE_NAME_CASE(VROTLI)
31613   NODE_NAME_CASE(VROTRI)
31614   NODE_NAME_CASE(VPPERM)
31615   NODE_NAME_CASE(CMPP)
31616   NODE_NAME_CASE(STRICT_CMPP)
31617   NODE_NAME_CASE(PCMPEQ)
31618   NODE_NAME_CASE(PCMPGT)
31619   NODE_NAME_CASE(PHMINPOS)
31620   NODE_NAME_CASE(ADD)
31621   NODE_NAME_CASE(SUB)
31622   NODE_NAME_CASE(ADC)
31623   NODE_NAME_CASE(SBB)
31624   NODE_NAME_CASE(SMUL)
31625   NODE_NAME_CASE(UMUL)
31626   NODE_NAME_CASE(OR)
31627   NODE_NAME_CASE(XOR)
31628   NODE_NAME_CASE(AND)
31629   NODE_NAME_CASE(BEXTR)
31630   NODE_NAME_CASE(BEXTRI)
31631   NODE_NAME_CASE(BZHI)
31632   NODE_NAME_CASE(PDEP)
31633   NODE_NAME_CASE(PEXT)
31634   NODE_NAME_CASE(MUL_IMM)
31635   NODE_NAME_CASE(MOVMSK)
31636   NODE_NAME_CASE(PTEST)
31637   NODE_NAME_CASE(TESTP)
31638   NODE_NAME_CASE(KORTEST)
31639   NODE_NAME_CASE(KTEST)
31640   NODE_NAME_CASE(KADD)
31641   NODE_NAME_CASE(KSHIFTL)
31642   NODE_NAME_CASE(KSHIFTR)
31643   NODE_NAME_CASE(PACKSS)
31644   NODE_NAME_CASE(PACKUS)
31645   NODE_NAME_CASE(PALIGNR)
31646   NODE_NAME_CASE(VALIGN)
31647   NODE_NAME_CASE(VSHLD)
31648   NODE_NAME_CASE(VSHRD)
31649   NODE_NAME_CASE(VSHLDV)
31650   NODE_NAME_CASE(VSHRDV)
31651   NODE_NAME_CASE(PSHUFD)
31652   NODE_NAME_CASE(PSHUFHW)
31653   NODE_NAME_CASE(PSHUFLW)
31654   NODE_NAME_CASE(SHUFP)
31655   NODE_NAME_CASE(SHUF128)
31656   NODE_NAME_CASE(MOVLHPS)
31657   NODE_NAME_CASE(MOVHLPS)
31658   NODE_NAME_CASE(MOVDDUP)
31659   NODE_NAME_CASE(MOVSHDUP)
31660   NODE_NAME_CASE(MOVSLDUP)
31661   NODE_NAME_CASE(MOVSD)
31662   NODE_NAME_CASE(MOVSS)
31663   NODE_NAME_CASE(UNPCKL)
31664   NODE_NAME_CASE(UNPCKH)
31665   NODE_NAME_CASE(VBROADCAST)
31666   NODE_NAME_CASE(VBROADCAST_LOAD)
31667   NODE_NAME_CASE(VBROADCASTM)
31668   NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
31669   NODE_NAME_CASE(VPERMILPV)
31670   NODE_NAME_CASE(VPERMILPI)
31671   NODE_NAME_CASE(VPERM2X128)
31672   NODE_NAME_CASE(VPERMV)
31673   NODE_NAME_CASE(VPERMV3)
31674   NODE_NAME_CASE(VPERMI)
31675   NODE_NAME_CASE(VPTERNLOG)
31676   NODE_NAME_CASE(VFIXUPIMM)
31677   NODE_NAME_CASE(VFIXUPIMM_SAE)
31678   NODE_NAME_CASE(VFIXUPIMMS)
31679   NODE_NAME_CASE(VFIXUPIMMS_SAE)
31680   NODE_NAME_CASE(VRANGE)
31681   NODE_NAME_CASE(VRANGE_SAE)
31682   NODE_NAME_CASE(VRANGES)
31683   NODE_NAME_CASE(VRANGES_SAE)
31684   NODE_NAME_CASE(PMULUDQ)
31685   NODE_NAME_CASE(PMULDQ)
31686   NODE_NAME_CASE(PSADBW)
31687   NODE_NAME_CASE(DBPSADBW)
31688   NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
31689   NODE_NAME_CASE(VAARG_64)
31690   NODE_NAME_CASE(VAARG_X32)
31691   NODE_NAME_CASE(WIN_ALLOCA)
31692   NODE_NAME_CASE(MEMBARRIER)
31693   NODE_NAME_CASE(MFENCE)
31694   NODE_NAME_CASE(SEG_ALLOCA)
31695   NODE_NAME_CASE(PROBED_ALLOCA)
31696   NODE_NAME_CASE(RDRAND)
31697   NODE_NAME_CASE(RDSEED)
31698   NODE_NAME_CASE(RDPKRU)
31699   NODE_NAME_CASE(WRPKRU)
31700   NODE_NAME_CASE(VPMADDUBSW)
31701   NODE_NAME_CASE(VPMADDWD)
31702   NODE_NAME_CASE(VPSHA)
31703   NODE_NAME_CASE(VPSHL)
31704   NODE_NAME_CASE(VPCOM)
31705   NODE_NAME_CASE(VPCOMU)
31706   NODE_NAME_CASE(VPERMIL2)
31707   NODE_NAME_CASE(FMSUB)
31708   NODE_NAME_CASE(STRICT_FMSUB)
31709   NODE_NAME_CASE(FNMADD)
31710   NODE_NAME_CASE(STRICT_FNMADD)
31711   NODE_NAME_CASE(FNMSUB)
31712   NODE_NAME_CASE(STRICT_FNMSUB)
31713   NODE_NAME_CASE(FMADDSUB)
31714   NODE_NAME_CASE(FMSUBADD)
31715   NODE_NAME_CASE(FMADD_RND)
31716   NODE_NAME_CASE(FNMADD_RND)
31717   NODE_NAME_CASE(FMSUB_RND)
31718   NODE_NAME_CASE(FNMSUB_RND)
31719   NODE_NAME_CASE(FMADDSUB_RND)
31720   NODE_NAME_CASE(FMSUBADD_RND)
31721   NODE_NAME_CASE(VPMADD52H)
31722   NODE_NAME_CASE(VPMADD52L)
31723   NODE_NAME_CASE(VRNDSCALE)
31724   NODE_NAME_CASE(STRICT_VRNDSCALE)
31725   NODE_NAME_CASE(VRNDSCALE_SAE)
31726   NODE_NAME_CASE(VRNDSCALES)
31727   NODE_NAME_CASE(VRNDSCALES_SAE)
31728   NODE_NAME_CASE(VREDUCE)
31729   NODE_NAME_CASE(VREDUCE_SAE)
31730   NODE_NAME_CASE(VREDUCES)
31731   NODE_NAME_CASE(VREDUCES_SAE)
31732   NODE_NAME_CASE(VGETMANT)
31733   NODE_NAME_CASE(VGETMANT_SAE)
31734   NODE_NAME_CASE(VGETMANTS)
31735   NODE_NAME_CASE(VGETMANTS_SAE)
31736   NODE_NAME_CASE(PCMPESTR)
31737   NODE_NAME_CASE(PCMPISTR)
31738   NODE_NAME_CASE(XTEST)
31739   NODE_NAME_CASE(COMPRESS)
31740   NODE_NAME_CASE(EXPAND)
31741   NODE_NAME_CASE(SELECTS)
31742   NODE_NAME_CASE(ADDSUB)
31743   NODE_NAME_CASE(RCP14)
31744   NODE_NAME_CASE(RCP14S)
31745   NODE_NAME_CASE(RCP28)
31746   NODE_NAME_CASE(RCP28_SAE)
31747   NODE_NAME_CASE(RCP28S)
31748   NODE_NAME_CASE(RCP28S_SAE)
31749   NODE_NAME_CASE(EXP2)
31750   NODE_NAME_CASE(EXP2_SAE)
31751   NODE_NAME_CASE(RSQRT14)
31752   NODE_NAME_CASE(RSQRT14S)
31753   NODE_NAME_CASE(RSQRT28)
31754   NODE_NAME_CASE(RSQRT28_SAE)
31755   NODE_NAME_CASE(RSQRT28S)
31756   NODE_NAME_CASE(RSQRT28S_SAE)
31757   NODE_NAME_CASE(FADD_RND)
31758   NODE_NAME_CASE(FADDS)
31759   NODE_NAME_CASE(FADDS_RND)
31760   NODE_NAME_CASE(FSUB_RND)
31761   NODE_NAME_CASE(FSUBS)
31762   NODE_NAME_CASE(FSUBS_RND)
31763   NODE_NAME_CASE(FMUL_RND)
31764   NODE_NAME_CASE(FMULS)
31765   NODE_NAME_CASE(FMULS_RND)
31766   NODE_NAME_CASE(FDIV_RND)
31767   NODE_NAME_CASE(FDIVS)
31768   NODE_NAME_CASE(FDIVS_RND)
31769   NODE_NAME_CASE(FSQRT_RND)
31770   NODE_NAME_CASE(FSQRTS)
31771   NODE_NAME_CASE(FSQRTS_RND)
31772   NODE_NAME_CASE(FGETEXP)
31773   NODE_NAME_CASE(FGETEXP_SAE)
31774   NODE_NAME_CASE(FGETEXPS)
31775   NODE_NAME_CASE(FGETEXPS_SAE)
31776   NODE_NAME_CASE(SCALEF)
31777   NODE_NAME_CASE(SCALEF_RND)
31778   NODE_NAME_CASE(SCALEFS)
31779   NODE_NAME_CASE(SCALEFS_RND)
31780   NODE_NAME_CASE(AVG)
31781   NODE_NAME_CASE(MULHRS)
31782   NODE_NAME_CASE(SINT_TO_FP_RND)
31783   NODE_NAME_CASE(UINT_TO_FP_RND)
31784   NODE_NAME_CASE(CVTTP2SI)
31785   NODE_NAME_CASE(CVTTP2UI)
31786   NODE_NAME_CASE(STRICT_CVTTP2SI)
31787   NODE_NAME_CASE(STRICT_CVTTP2UI)
31788   NODE_NAME_CASE(MCVTTP2SI)
31789   NODE_NAME_CASE(MCVTTP2UI)
31790   NODE_NAME_CASE(CVTTP2SI_SAE)
31791   NODE_NAME_CASE(CVTTP2UI_SAE)
31792   NODE_NAME_CASE(CVTTS2SI)
31793   NODE_NAME_CASE(CVTTS2UI)
31794   NODE_NAME_CASE(CVTTS2SI_SAE)
31795   NODE_NAME_CASE(CVTTS2UI_SAE)
31796   NODE_NAME_CASE(CVTSI2P)
31797   NODE_NAME_CASE(CVTUI2P)
31798   NODE_NAME_CASE(STRICT_CVTSI2P)
31799   NODE_NAME_CASE(STRICT_CVTUI2P)
31800   NODE_NAME_CASE(MCVTSI2P)
31801   NODE_NAME_CASE(MCVTUI2P)
31802   NODE_NAME_CASE(VFPCLASS)
31803   NODE_NAME_CASE(VFPCLASSS)
31804   NODE_NAME_CASE(MULTISHIFT)
31805   NODE_NAME_CASE(SCALAR_SINT_TO_FP)
31806   NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
31807   NODE_NAME_CASE(SCALAR_UINT_TO_FP)
31808   NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
31809   NODE_NAME_CASE(CVTPS2PH)
31810   NODE_NAME_CASE(STRICT_CVTPS2PH)
31811   NODE_NAME_CASE(MCVTPS2PH)
31812   NODE_NAME_CASE(CVTPH2PS)
31813   NODE_NAME_CASE(STRICT_CVTPH2PS)
31814   NODE_NAME_CASE(CVTPH2PS_SAE)
31815   NODE_NAME_CASE(CVTP2SI)
31816   NODE_NAME_CASE(CVTP2UI)
31817   NODE_NAME_CASE(MCVTP2SI)
31818   NODE_NAME_CASE(MCVTP2UI)
31819   NODE_NAME_CASE(CVTP2SI_RND)
31820   NODE_NAME_CASE(CVTP2UI_RND)
31821   NODE_NAME_CASE(CVTS2SI)
31822   NODE_NAME_CASE(CVTS2UI)
31823   NODE_NAME_CASE(CVTS2SI_RND)
31824   NODE_NAME_CASE(CVTS2UI_RND)
31825   NODE_NAME_CASE(CVTNE2PS2BF16)
31826   NODE_NAME_CASE(CVTNEPS2BF16)
31827   NODE_NAME_CASE(MCVTNEPS2BF16)
31828   NODE_NAME_CASE(DPBF16PS)
31829   NODE_NAME_CASE(LWPINS)
31830   NODE_NAME_CASE(MGATHER)
31831   NODE_NAME_CASE(MSCATTER)
31832   NODE_NAME_CASE(VPDPBUSD)
31833   NODE_NAME_CASE(VPDPBUSDS)
31834   NODE_NAME_CASE(VPDPWSSD)
31835   NODE_NAME_CASE(VPDPWSSDS)
31836   NODE_NAME_CASE(VPSHUFBITQMB)
31837   NODE_NAME_CASE(GF2P8MULB)
31838   NODE_NAME_CASE(GF2P8AFFINEQB)
31839   NODE_NAME_CASE(GF2P8AFFINEINVQB)
31840   NODE_NAME_CASE(NT_CALL)
31841   NODE_NAME_CASE(NT_BRIND)
31842   NODE_NAME_CASE(UMWAIT)
31843   NODE_NAME_CASE(TPAUSE)
31844   NODE_NAME_CASE(ENQCMD)
31845   NODE_NAME_CASE(ENQCMDS)
31846   NODE_NAME_CASE(VP2INTERSECT)
31847   NODE_NAME_CASE(AESENC128KL)
31848   NODE_NAME_CASE(AESDEC128KL)
31849   NODE_NAME_CASE(AESENC256KL)
31850   NODE_NAME_CASE(AESDEC256KL)
31851   NODE_NAME_CASE(AESENCWIDE128KL)
31852   NODE_NAME_CASE(AESDECWIDE128KL)
31853   NODE_NAME_CASE(AESENCWIDE256KL)
31854   NODE_NAME_CASE(AESDECWIDE256KL)
31855   NODE_NAME_CASE(TESTUI)
31856   }
31857   return nullptr;
31858 #undef NODE_NAME_CASE
31859 }
31860 
31861 /// Return true if the addressing mode represented by AM is legal for this
31862 /// target, for a load/store of the specified type.
31863 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
31864                                               const AddrMode &AM, Type *Ty,
31865                                               unsigned AS,
31866                                               Instruction *I) const {
31867   // X86 supports extremely general addressing modes.
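  // The general form is [BaseReg + Scale * IndexReg + Disp32], optionally
  // relative to a GlobalValue. Illustrative assembly (sym is a placeholder):
  //   movl 24(%rbx,%rcx,4), %eax
  //   leaq sym(%rip), %rax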
31868   CodeModel::Model M = getTargetMachine().getCodeModel();
31869 
31870   // X86 allows a sign-extended 32-bit immediate field as a displacement.
31871   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
31872     return false;
31873 
31874   if (AM.BaseGV) {
31875     unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
31876 
31877     // If a reference to this global requires an extra load, we can't fold it.
31878     if (isGlobalStubReference(GVFlags))
31879       return false;
31880 
31881     // If BaseGV requires a register for the PIC base, we cannot also have a
31882     // BaseReg specified.
31883     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
31884       return false;
31885 
31886     // If lower 4G is not available, then we must use rip-relative addressing.
31887     if ((M != CodeModel::Small || isPositionIndependent()) &&
31888         Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
31889       return false;
31890   }
31891 
31892   switch (AM.Scale) {
31893   case 0:
31894   case 1:
31895   case 2:
31896   case 4:
31897   case 8:
31898     // These scales always work.
31899     break;
31900   case 3:
31901   case 5:
31902   case 9:
31903     // These scales are formed with basereg+scalereg.  Only accept if there is
31904     // no basereg yet.
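    // For example, scale 3 is matched as "lea (%reg,%reg,2)" (illustrative),
    // which already occupies the base-register slot.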
31905     if (AM.HasBaseReg)
31906       return false;
31907     break;
31908   default:  // Other stuff never works.
31909     return false;
31910   }
31911 
31912   return true;
31913 }
31914 
31915 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
31916   unsigned Bits = Ty->getScalarSizeInBits();
31917 
  // 8-bit shifts are always expensive, and the variants that take a scalar
  // shift amount aren't particularly cheaper than the fully variable ones.
31920   if (Bits == 8)
31921     return false;
31922 
31923   // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
31924   // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
31925   if (Subtarget.hasXOP() &&
31926       (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
31927     return false;
31928 
31929   // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
31930   // shifts just as cheap as scalar ones.
31931   if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
31932     return false;
31933 
31934   // AVX512BW has shifts such as vpsllvw.
31935   if (Subtarget.hasBWI() && Bits == 16)
    return false;
31937 
31938   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
31939   // fully general vector.
31940   return true;
31941 }
31942 
31943 bool X86TargetLowering::isBinOp(unsigned Opcode) const {
31944   switch (Opcode) {
31945   // These are non-commutative binops.
31946   // TODO: Add more X86ISD opcodes once we have test coverage.
31947   case X86ISD::ANDNP:
31948   case X86ISD::PCMPGT:
31949   case X86ISD::FMAX:
31950   case X86ISD::FMIN:
31951   case X86ISD::FANDN:
31952     return true;
31953   }
31954 
31955   return TargetLoweringBase::isBinOp(Opcode);
31956 }
31957 
31958 bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
31959   switch (Opcode) {
31960   // TODO: Add more X86ISD opcodes once we have test coverage.
31961   case X86ISD::PCMPEQ:
31962   case X86ISD::PMULDQ:
31963   case X86ISD::PMULUDQ:
31964   case X86ISD::FMAXC:
31965   case X86ISD::FMINC:
31966   case X86ISD::FAND:
31967   case X86ISD::FOR:
31968   case X86ISD::FXOR:
31969     return true;
31970   }
31971 
31972   return TargetLoweringBase::isCommutativeBinOp(Opcode);
31973 }
31974 
31975 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
31976   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
31977     return false;
31978   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
31979   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
31980   return NumBits1 > NumBits2;
31981 }
31982 
31983 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
31984   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
31985     return false;
31986 
31987   if (!isTypeLegal(EVT::getEVT(Ty1)))
31988     return false;
31989 
31990   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
31991 
31992   // Assuming the caller doesn't have a zeroext or signext return parameter,
31993   // truncation all the way down to i1 is valid.
31994   return true;
31995 }
31996 
31997 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
31998   return isInt<32>(Imm);
31999 }
32000 
32001 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
32002   // Can also use sub to handle negated immediates.
32003   return isInt<32>(Imm);
32004 }
32005 
32006 bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
32007   return isInt<32>(Imm);
32008 }
32009 
32010 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
32011   if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
32012     return false;
32013   unsigned NumBits1 = VT1.getSizeInBits();
32014   unsigned NumBits2 = VT2.getSizeInBits();
32015   return NumBits1 > NumBits2;
32016 }
32017 
32018 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
32019   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
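  // For example, "movl %ecx, %eax" already clears bits 63:32 of %rax, so no
  // extra instruction is needed for the i32 -> i64 zero extension.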
32020   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
32021 }
32022 
32023 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
32024   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
32025   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
32026 }
32027 
32028 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
32029   EVT VT1 = Val.getValueType();
32030   if (isZExtFree(VT1, VT2))
32031     return true;
32032 
32033   if (Val.getOpcode() != ISD::LOAD)
32034     return false;
32035 
32036   if (!VT1.isSimple() || !VT1.isInteger() ||
32037       !VT2.isSimple() || !VT2.isInteger())
32038     return false;
32039 
32040   switch (VT1.getSimpleVT().SimpleTy) {
32041   default: break;
32042   case MVT::i8:
32043   case MVT::i16:
32044   case MVT::i32:
32045     // X86 has 8, 16, and 32-bit zero-extending loads.
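    // (movzbl/movzwl, and a plain 32-bit "movl (%rdi), %eax" implicitly
    // zero-extends into the full 64-bit register).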
32046     return true;
32047   }
32048 
32049   return false;
32050 }
32051 
32052 bool X86TargetLowering::shouldSinkOperands(Instruction *I,
32053                                            SmallVectorImpl<Use *> &Ops) const {
32054   // A uniform shift amount in a vector shift or funnel shift may be much
32055   // cheaper than a generic variable vector shift, so make that pattern visible
32056   // to SDAG by sinking the shuffle instruction next to the shift.
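  // For example (illustrative IR):
  //   %s = shufflevector <4 x i32> %amt, <4 x i32> undef,
  //                      <4 x i32> zeroinitializer
  //   %r = shl <4 x i32> %x, %s
  // Sinking %s next to the shl lets instruction selection pick a
  // shift-by-scalar form instead of a fully variable vector shift.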
32057   int ShiftAmountOpNum = -1;
32058   if (I->isShift())
32059     ShiftAmountOpNum = 1;
32060   else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
32061     if (II->getIntrinsicID() == Intrinsic::fshl ||
32062         II->getIntrinsicID() == Intrinsic::fshr)
32063       ShiftAmountOpNum = 2;
32064   }
32065 
32066   if (ShiftAmountOpNum == -1)
32067     return false;
32068 
32069   auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
32070   if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
32071       isVectorShiftByScalarCheap(I->getType())) {
32072     Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
32073     return true;
32074   }
32075 
32076   return false;
32077 }
32078 
32079 bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
32080   if (!Subtarget.is64Bit())
32081     return false;
32082   return TargetLowering::shouldConvertPhiType(From, To);
32083 }
32084 
32085 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
32086   if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
32087     return false;
32088 
32089   EVT SrcVT = ExtVal.getOperand(0).getValueType();
32090 
32091   // There is no extending load for vXi1.
32092   if (SrcVT.getScalarType() == MVT::i1)
32093     return false;
32094 
32095   return true;
32096 }
32097 
32098 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
32099                                                    EVT VT) const {
32100   if (!Subtarget.hasAnyFMA())
32101     return false;
32102 
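  // Only f32 and f64 element types are profitable here: those are the types
  // covered by the FMA/FMA4/AVX-512 fused multiply-add instructions
  // (e.g. vfmadd213ss/sd/ps/pd).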
32103   VT = VT.getScalarType();
32104 
32105   if (!VT.isSimple())
32106     return false;
32107 
32108   switch (VT.getSimpleVT().SimpleTy) {
32109   case MVT::f32:
32110   case MVT::f64:
32111     return true;
32112   default:
32113     break;
32114   }
32115 
32116   return false;
32117 }
32118 
32119 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
32120   // i16 instructions are longer (0x66 prefix) and potentially slower.
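  // For example, "addw $1000, %ax" needs a 0x66 operand-size prefix and a
  // 16-bit immediate (a length-changing prefix, which stalls decoding on some
  // microarchitectures), while the 32-bit "addl $1000, %eax" does not.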
32121   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
32122 }
32123 
32124 /// Targets can use this to indicate that they only support *some*
32125 /// VECTOR_SHUFFLE operations, those with specific masks.
32126 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
32127 /// are assumed to be legal.
32128 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
32129   if (!VT.isSimple())
32130     return false;
32131 
32132   // Not for i1 vectors
32133   if (VT.getSimpleVT().getScalarType() == MVT::i1)
32134     return false;
32135 
32136   // Very little shuffling can be done for 64-bit vectors right now.
32137   if (VT.getSimpleVT().getSizeInBits() == 64)
32138     return false;
32139 
32140   // We only care that the types being shuffled are legal. The lowering can
32141   // handle any possible shuffle mask that results.
32142   return isTypeLegal(VT.getSimpleVT());
32143 }
32144 
32145 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
32146                                                EVT VT) const {
32147   // Don't convert an 'and' into a shuffle that we don't directly support.
32148   // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
32149   if (!Subtarget.hasAVX2())
32150     if (VT == MVT::v32i8 || VT == MVT::v16i16)
32151       return false;
32152 
32153   // Just delegate to the generic legality, clear masks aren't special.
32154   return isShuffleMaskLegal(Mask, VT);
32155 }
32156 
32157 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
  // If the subtarget is using indirect-branch thunks, we must not generate
  // jump tables.
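  // (For example, under retpoline-style mitigations the indirect branch a jump
  // table requires would itself have to go through a thunk, which defeats the
  // point of using a table.)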
32159   if (Subtarget.useIndirectThunkBranches())
32160     return false;
32161 
  // Otherwise, fall back on the generic logic.
32163   return TargetLowering::areJTsAllowed(Fn);
32164 }
32165 
32166 //===----------------------------------------------------------------------===//
32167 //                           X86 Scheduler Hooks
32168 //===----------------------------------------------------------------------===//
32169 
// Returns true if EFLAGS is consumed after this iterator in the rest of the
32171 // basic block or any successors of the basic block.
32172 static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
32173                               MachineBasicBlock *BB) {
32174   // Scan forward through BB for a use/def of EFLAGS.
32175   for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();
32176          miI != miE; ++miI) {
32177     const MachineInstr& mi = *miI;
32178     if (mi.readsRegister(X86::EFLAGS))
32179       return true;
32180     // If we found a def, we can stop searching.
32181     if (mi.definesRegister(X86::EFLAGS))
32182       return false;
32183   }
32184 
32185   // If we hit the end of the block, check whether EFLAGS is live into a
32186   // successor.
32187   for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
32188                                         sEnd = BB->succ_end();
32189        sItr != sEnd; ++sItr) {
32190     MachineBasicBlock* succ = *sItr;
32191     if (succ->isLiveIn(X86::EFLAGS))
32192       return true;
32193   }
32194 
32195   return false;
32196 }
32197 
32198 /// Utility function to emit xbegin specifying the start of an RTM region.
32199 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
32200                                      const TargetInstrInfo *TII) {
32201   const DebugLoc &DL = MI.getDebugLoc();
32202 
32203   const BasicBlock *BB = MBB->getBasicBlock();
32204   MachineFunction::iterator I = ++MBB->getIterator();
32205 
32206   // For the v = xbegin(), we generate
32207   //
32208   // thisMBB:
32209   //  xbegin sinkMBB
32210   //
32211   // mainMBB:
32212   //  s0 = -1
32213   //
32214   // fallBB:
32215   //  eax = # XABORT_DEF
32216   //  s1 = eax
32217   //
32218   // sinkMBB:
32219   //  v = phi(s0/mainBB, s1/fallBB)
32220 
32221   MachineBasicBlock *thisMBB = MBB;
32222   MachineFunction *MF = MBB->getParent();
32223   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
32224   MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
32225   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
32226   MF->insert(I, mainMBB);
32227   MF->insert(I, fallMBB);
32228   MF->insert(I, sinkMBB);
32229 
32230   if (isEFLAGSLiveAfter(MI, MBB)) {
32231     mainMBB->addLiveIn(X86::EFLAGS);
32232     fallMBB->addLiveIn(X86::EFLAGS);
32233     sinkMBB->addLiveIn(X86::EFLAGS);
32234   }
32235 
32236   // Transfer the remainder of BB and its successor edges to sinkMBB.
32237   sinkMBB->splice(sinkMBB->begin(), MBB,
32238                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
32239   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
32240 
32241   MachineRegisterInfo &MRI = MF->getRegInfo();
32242   Register DstReg = MI.getOperand(0).getReg();
32243   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
32244   Register mainDstReg = MRI.createVirtualRegister(RC);
32245   Register fallDstReg = MRI.createVirtualRegister(RC);
32246 
32247   // thisMBB:
32248   //  xbegin fallMBB
32249   //  # fallthrough to mainMBB
  //  # on abort, branch to fallMBB
32251   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
32252   thisMBB->addSuccessor(mainMBB);
32253   thisMBB->addSuccessor(fallMBB);
32254 
32255   // mainMBB:
32256   //  mainDstReg := -1
32257   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
32258   BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
32259   mainMBB->addSuccessor(sinkMBB);
32260 
32261   // fallMBB:
32262   //  ; pseudo instruction to model hardware's definition from XABORT
32263   //  EAX := XABORT_DEF
32264   //  fallDstReg := EAX
32265   BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
32266   BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
32267       .addReg(X86::EAX);
32268   fallMBB->addSuccessor(sinkMBB);
32269 
32270   // sinkMBB:
32271   //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
32272   BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
32273       .addReg(mainDstReg).addMBB(mainMBB)
32274       .addReg(fallDstReg).addMBB(fallMBB);
32275 
32276   MI.eraseFromParent();
32277   return sinkMBB;
32278 }
32279 
32280 MachineBasicBlock *
32281 X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
32282                                                MachineBasicBlock *MBB) const {
32283   // Emit va_arg instruction on X86-64.
32284 
32285   // Operands to this pseudo-instruction:
32286   // 0  ) Output        : destination address (reg)
32287   // 1-5) Input         : va_list address (addr, i64mem)
32288   // 6  ) ArgSize       : Size (in bytes) of vararg type
32289   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
32290   // 8  ) Align         : Alignment of type
32291   // 9  ) EFLAGS (implicit-def)
32292 
32293   assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
32294   static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
32295 
32296   Register DestReg = MI.getOperand(0).getReg();
32297   MachineOperand &Base = MI.getOperand(1);
32298   MachineOperand &Scale = MI.getOperand(2);
32299   MachineOperand &Index = MI.getOperand(3);
32300   MachineOperand &Disp = MI.getOperand(4);
32301   MachineOperand &Segment = MI.getOperand(5);
32302   unsigned ArgSize = MI.getOperand(6).getImm();
32303   unsigned ArgMode = MI.getOperand(7).getImm();
32304   Align Alignment = Align(MI.getOperand(8).getImm());
32305 
32306   MachineFunction *MF = MBB->getParent();
32307 
32308   // Memory Reference
32309   assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
32310 
32311   MachineMemOperand *OldMMO = MI.memoperands().front();
32312 
32313   // Clone the MMO into two separate MMOs for loading and storing
32314   MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
32315       OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
32316   MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
32317       OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
32318 
32319   // Machine Information
32320   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32321   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
32322   const TargetRegisterClass *AddrRegClass =
32323       getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
32324   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
32325   const DebugLoc &DL = MI.getDebugLoc();
32326 
32327   // struct va_list {
32328   //   i32   gp_offset
32329   //   i32   fp_offset
32330   //   i64   overflow_area (address)
32331   //   i64   reg_save_area (address)
32332   // }
32333   // sizeof(va_list) = 24
32334   // alignment(va_list) = 8
32335 
32336   unsigned TotalNumIntRegs = 6;
32337   unsigned TotalNumXMMRegs = 8;
32338   bool UseGPOffset = (ArgMode == 1);
32339   bool UseFPOffset = (ArgMode == 2);
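  // In the SysV x86-64 ABI the reg_save_area holds the six integer argument
  // registers (6 * 8 = 48 bytes) followed by the eight vector argument
  // registers (8 * 16 = 128 bytes), so MaxOffset is 48 when using gp_offset
  // and 176 when using fp_offset.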
32340   unsigned MaxOffset = TotalNumIntRegs * 8 +
32341                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
32342 
  // Align ArgSize to a multiple of 8.
32344   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
32345   bool NeedsAlign = (Alignment > 8);
32346 
32347   MachineBasicBlock *thisMBB = MBB;
32348   MachineBasicBlock *overflowMBB;
32349   MachineBasicBlock *offsetMBB;
32350   MachineBasicBlock *endMBB;
32351 
32352   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
32353   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
32354   unsigned OffsetReg = 0;
32355 
32356   if (!UseGPOffset && !UseFPOffset) {
32357     // If we only pull from the overflow region, we don't create a branch.
32358     // We don't need to alter control flow.
32359     OffsetDestReg = 0; // unused
32360     OverflowDestReg = DestReg;
32361 
32362     offsetMBB = nullptr;
32363     overflowMBB = thisMBB;
32364     endMBB = thisMBB;
32365   } else {
32366     // First emit code to check if gp_offset (or fp_offset) is below the bound.
32367     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
32368     // If not, pull from overflow_area. (branch to overflowMBB)
32369     //
32370     //       thisMBB
32371     //         |     .
32372     //         |        .
32373     //     offsetMBB   overflowMBB
32374     //         |        .
32375     //         |     .
32376     //        endMBB
32377 
32378     // Registers for the PHI in endMBB
32379     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
32380     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
32381 
32382     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
32383     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32384     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32385     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32386 
32387     MachineFunction::iterator MBBIter = ++MBB->getIterator();
32388 
32389     // Insert the new basic blocks
32390     MF->insert(MBBIter, offsetMBB);
32391     MF->insert(MBBIter, overflowMBB);
32392     MF->insert(MBBIter, endMBB);
32393 
32394     // Transfer the remainder of MBB and its successor edges to endMBB.
32395     endMBB->splice(endMBB->begin(), thisMBB,
32396                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
32397     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
32398 
32399     // Make offsetMBB and overflowMBB successors of thisMBB
32400     thisMBB->addSuccessor(offsetMBB);
32401     thisMBB->addSuccessor(overflowMBB);
32402 
32403     // endMBB is a successor of both offsetMBB and overflowMBB
32404     offsetMBB->addSuccessor(endMBB);
32405     overflowMBB->addSuccessor(endMBB);
32406 
32407     // Load the offset value into a register
32408     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
32409     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
32410         .add(Base)
32411         .add(Scale)
32412         .add(Index)
32413         .addDisp(Disp, UseFPOffset ? 4 : 0)
32414         .add(Segment)
32415         .setMemRefs(LoadOnlyMMO);
32416 
32417     // Check if there is enough room left to pull this argument.
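    // The overflow path is taken when OffsetReg >= MaxOffset + 8 - ArgSizeA8,
    // i.e. when OffsetReg + ArgSizeA8 >= MaxOffset + 8 and the argument would
    // no longer fit in the register save area.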
32418     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
32419       .addReg(OffsetReg)
32420       .addImm(MaxOffset + 8 - ArgSizeA8);
32421 
32422     // Branch to "overflowMBB" if offset >= max
32423     // Fall through to "offsetMBB" otherwise
32424     BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
32425       .addMBB(overflowMBB).addImm(X86::COND_AE);
32426   }
32427 
32428   // In offsetMBB, emit code to use the reg_save_area.
32429   if (offsetMBB) {
32430     assert(OffsetReg != 0);
32431 
32432     // Read the reg_save_area address.
32433     Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
32434     BuildMI(
32435         offsetMBB, DL,
32436         TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
32437         RegSaveReg)
32438         .add(Base)
32439         .add(Scale)
32440         .add(Index)
32441         .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
32442         .add(Segment)
32443         .setMemRefs(LoadOnlyMMO);
32444 
32445     if (Subtarget.isTarget64BitLP64()) {
32446       // Zero-extend the offset
32447       Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
32448       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
32449           .addImm(0)
32450           .addReg(OffsetReg)
32451           .addImm(X86::sub_32bit);
32452 
32453       // Add the offset to the reg_save_area to get the final address.
32454       BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
32455           .addReg(OffsetReg64)
32456           .addReg(RegSaveReg);
32457     } else {
32458       // Add the offset to the reg_save_area to get the final address.
32459       BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
32460           .addReg(OffsetReg)
32461           .addReg(RegSaveReg);
32462     }
32463 
32464     // Compute the offset for the next argument
32465     Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
32466     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
32467       .addReg(OffsetReg)
32468       .addImm(UseFPOffset ? 16 : 8);
32469 
32470     // Store it back into the va_list.
32471     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
32472         .add(Base)
32473         .add(Scale)
32474         .add(Index)
32475         .addDisp(Disp, UseFPOffset ? 4 : 0)
32476         .add(Segment)
32477         .addReg(NextOffsetReg)
32478         .setMemRefs(StoreOnlyMMO);
32479 
32480     // Jump to endMBB
32481     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
32482       .addMBB(endMBB);
32483   }
32484 
32485   //
32486   // Emit code to use overflow area
32487   //
32488 
32489   // Load the overflow_area address into a register.
32490   Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
32491   BuildMI(overflowMBB, DL,
32492           TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
32493           OverflowAddrReg)
32494       .add(Base)
32495       .add(Scale)
32496       .add(Index)
32497       .addDisp(Disp, 8)
32498       .add(Segment)
32499       .setMemRefs(LoadOnlyMMO);
32500 
32501   // If we need to align it, do so. Otherwise, just copy the address
32502   // to OverflowDestReg.
32503   if (NeedsAlign) {
32504     // Align the overflow address
32505     Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
32506 
32507     // aligned_addr = (addr + (align-1)) & ~(align-1)
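    // e.g. with Alignment = 16: 0x1008 -> (0x1008 + 15) & ~15 = 0x1010.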
32508     BuildMI(
32509         overflowMBB, DL,
32510         TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
32511         TmpReg)
32512         .addReg(OverflowAddrReg)
32513         .addImm(Alignment.value() - 1);
32514 
32515     BuildMI(
32516         overflowMBB, DL,
32517         TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
32518         OverflowDestReg)
32519         .addReg(TmpReg)
32520         .addImm(~(uint64_t)(Alignment.value() - 1));
32521   } else {
32522     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
32523       .addReg(OverflowAddrReg);
32524   }
32525 
32526   // Compute the next overflow address after this argument.
32527   // (the overflow address should be kept 8-byte aligned)
32528   Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
32529   BuildMI(
32530       overflowMBB, DL,
32531       TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
32532       NextAddrReg)
32533       .addReg(OverflowDestReg)
32534       .addImm(ArgSizeA8);
32535 
32536   // Store the new overflow address.
32537   BuildMI(overflowMBB, DL,
32538           TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
32539       .add(Base)
32540       .add(Scale)
32541       .add(Index)
32542       .addDisp(Disp, 8)
32543       .add(Segment)
32544       .addReg(NextAddrReg)
32545       .setMemRefs(StoreOnlyMMO);
32546 
32547   // If we branched, emit the PHI to the front of endMBB.
32548   if (offsetMBB) {
32549     BuildMI(*endMBB, endMBB->begin(), DL,
32550             TII->get(X86::PHI), DestReg)
32551       .addReg(OffsetDestReg).addMBB(offsetMBB)
32552       .addReg(OverflowDestReg).addMBB(overflowMBB);
32553   }
32554 
32555   // Erase the pseudo instruction
32556   MI.eraseFromParent();
32557 
32558   return endMBB;
32559 }
32560 
32561 // The EFLAGS operand of SelectItr might be missing a kill marker
32562 // because there were multiple uses of EFLAGS, and ISel didn't know
32563 // which to mark. Figure out whether SelectItr should have had a
32564 // kill marker, and set it if it should. Returns the correct kill
32565 // marker value.
32566 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
32567                                      MachineBasicBlock* BB,
32568                                      const TargetRegisterInfo* TRI) {
32569   if (isEFLAGSLiveAfter(SelectItr, BB))
32570     return false;
32571 
32572   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
32573   // out. SelectMI should have a kill flag on EFLAGS.
32574   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
32575   return true;
32576 }
32577 
32578 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
// together with other CMOV pseudo-opcodes into a single basic block with a
// conditional jump around it.
32581 static bool isCMOVPseudo(MachineInstr &MI) {
32582   switch (MI.getOpcode()) {
32583   case X86::CMOV_FR32:
32584   case X86::CMOV_FR32X:
32585   case X86::CMOV_FR64:
32586   case X86::CMOV_FR64X:
32587   case X86::CMOV_GR8:
32588   case X86::CMOV_GR16:
32589   case X86::CMOV_GR32:
32590   case X86::CMOV_RFP32:
32591   case X86::CMOV_RFP64:
32592   case X86::CMOV_RFP80:
32593   case X86::CMOV_VR64:
32594   case X86::CMOV_VR128:
32595   case X86::CMOV_VR128X:
32596   case X86::CMOV_VR256:
32597   case X86::CMOV_VR256X:
32598   case X86::CMOV_VR512:
32599   case X86::CMOV_VK1:
32600   case X86::CMOV_VK2:
32601   case X86::CMOV_VK4:
32602   case X86::CMOV_VK8:
32603   case X86::CMOV_VK16:
32604   case X86::CMOV_VK32:
32605   case X86::CMOV_VK64:
32606     return true;
32607 
32608   default:
32609     return false;
32610   }
32611 }
32612 
// Helper function that inserts PHI nodes into SinkMBB:
//   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
// in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
// the last PHI inserted.
32618 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
32619     MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
32620     MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
32621     MachineBasicBlock *SinkMBB) {
32622   MachineFunction *MF = TrueMBB->getParent();
32623   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
32624   const DebugLoc &DL = MIItBegin->getDebugLoc();
32625 
32626   X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
32627   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
32628 
32629   MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
32630 
32631   // As we are creating the PHIs, we have to be careful if there is more than
32632   // one.  Later CMOVs may reference the results of earlier CMOVs, but later
32633   // PHIs have to reference the individual true/false inputs from earlier PHIs.
32634   // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from each earlier PHI's
  // destination register to the registers that went into that PHI.
32637   DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
32638   MachineInstrBuilder MIB;
32639 
32640   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
32641     Register DestReg = MIIt->getOperand(0).getReg();
32642     Register Op1Reg = MIIt->getOperand(1).getReg();
32643     Register Op2Reg = MIIt->getOperand(2).getReg();
32644 
32645     // If this CMOV we are generating is the opposite condition from
32646     // the jump we generated, then we have to swap the operands for the
32647     // PHI that is going to be generated.
32648     if (MIIt->getOperand(3).getImm() == OppCC)
32649       std::swap(Op1Reg, Op2Reg);
32650 
32651     if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
32652       Op1Reg = RegRewriteTable[Op1Reg].first;
32653 
32654     if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
32655       Op2Reg = RegRewriteTable[Op2Reg].second;
32656 
32657     MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
32658               .addReg(Op1Reg)
32659               .addMBB(FalseMBB)
32660               .addReg(Op2Reg)
32661               .addMBB(TrueMBB);
32662 
32663     // Add this PHI to the rewrite table.
32664     RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
32665   }
32666 
32667   return MIB;
32668 }
32669 
// Lower cascaded selects of the form
//   (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
32671 MachineBasicBlock *
32672 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
32673                                              MachineInstr &SecondCascadedCMOV,
32674                                              MachineBasicBlock *ThisMBB) const {
32675   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32676   const DebugLoc &DL = FirstCMOV.getDebugLoc();
32677 
32678   // We lower cascaded CMOVs such as
32679   //
32680   //   (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
32681   //
32682   // to two successive branches.
32683   //
32684   // Without this, we would add a PHI between the two jumps, which ends up
32685   // creating a few copies all around. For instance, for
32686   //
32687   //    (sitofp (zext (fcmp une)))
32688   //
32689   // we would generate:
32690   //
32691   //         ucomiss %xmm1, %xmm0
32692   //         movss  <1.0f>, %xmm0
32693   //         movaps  %xmm0, %xmm1
32694   //         jne     .LBB5_2
32695   //         xorps   %xmm1, %xmm1
32696   // .LBB5_2:
32697   //         jp      .LBB5_4
32698   //         movaps  %xmm1, %xmm0
32699   // .LBB5_4:
32700   //         retq
32701   //
32702   // because this custom-inserter would have generated:
32703   //
32704   //   A
32705   //   | \
32706   //   |  B
32707   //   | /
32708   //   C
32709   //   | \
32710   //   |  D
32711   //   | /
32712   //   E
32713   //
32714   // A: X = ...; Y = ...
32715   // B: empty
32716   // C: Z = PHI [X, A], [Y, B]
32717   // D: empty
32718   // E: PHI [X, C], [Z, D]
32719   //
32720   // If we lower both CMOVs in a single step, we can instead generate:
32721   //
32722   //   A
32723   //   | \
32724   //   |  C
32725   //   | /|
32726   //   |/ |
32727   //   |  |
32728   //   |  D
32729   //   | /
32730   //   E
32731   //
32732   // A: X = ...; Y = ...
32733   // D: empty
32734   // E: PHI [X, A], [X, C], [Y, D]
32735   //
32736   // Which, in our sitofp/fcmp example, gives us something like:
32737   //
32738   //         ucomiss %xmm1, %xmm0
32739   //         movss  <1.0f>, %xmm0
32740   //         jne     .LBB5_4
32741   //         jp      .LBB5_4
32742   //         xorps   %xmm0, %xmm0
32743   // .LBB5_4:
32744   //         retq
32745   //
32746 
32747   // We lower cascaded CMOV into two successive branches to the same block.
32748   // EFLAGS is used by both, so mark it as live in the second.
32749   const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
32750   MachineFunction *F = ThisMBB->getParent();
32751   MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
32752   MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
32753   MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
32754 
32755   MachineFunction::iterator It = ++ThisMBB->getIterator();
32756   F->insert(It, FirstInsertedMBB);
32757   F->insert(It, SecondInsertedMBB);
32758   F->insert(It, SinkMBB);
32759 
32760   // For a cascaded CMOV, we lower it to two successive branches to
32761   // the same block (SinkMBB).  EFLAGS is used by both, so mark it as live in
32762   // the FirstInsertedMBB.
32763   FirstInsertedMBB->addLiveIn(X86::EFLAGS);
32764 
32765   // If the EFLAGS register isn't dead in the terminator, then claim that it's
32766   // live into the sink and copy blocks.
32767   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
32768   if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
32769       !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
32770     SecondInsertedMBB->addLiveIn(X86::EFLAGS);
32771     SinkMBB->addLiveIn(X86::EFLAGS);
32772   }
32773 
32774   // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
32775   SinkMBB->splice(SinkMBB->begin(), ThisMBB,
32776                   std::next(MachineBasicBlock::iterator(FirstCMOV)),
32777                   ThisMBB->end());
32778   SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
32779 
32780   // Fallthrough block for ThisMBB.
32781   ThisMBB->addSuccessor(FirstInsertedMBB);
32782   // The true block target of the first branch is always SinkMBB.
32783   ThisMBB->addSuccessor(SinkMBB);
32784   // Fallthrough block for FirstInsertedMBB.
32785   FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
32786   // The true block for the branch of FirstInsertedMBB.
32787   FirstInsertedMBB->addSuccessor(SinkMBB);
  // SecondInsertedMBB falls through to SinkMBB.
32789   SecondInsertedMBB->addSuccessor(SinkMBB);
32790 
32791   // Create the conditional branch instructions.
32792   X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
32793   BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
32794 
32795   X86::CondCode SecondCC =
32796       X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
32797   BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
32798 
32799   //  SinkMBB:
32800   //   %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
32801   Register DestReg = FirstCMOV.getOperand(0).getReg();
32802   Register Op1Reg = FirstCMOV.getOperand(1).getReg();
32803   Register Op2Reg = FirstCMOV.getOperand(2).getReg();
32804   MachineInstrBuilder MIB =
32805       BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
32806           .addReg(Op1Reg)
32807           .addMBB(SecondInsertedMBB)
32808           .addReg(Op2Reg)
32809           .addMBB(ThisMBB);
32810 
  // The edge from FirstInsertedMBB carries the True operand of the
  // SELECT_CC/CMOV nodes, the same value that flows in from ThisMBB.
32813   MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
32814   // Copy the PHI result to the register defined by the second CMOV.
32815   BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
32816           TII->get(TargetOpcode::COPY),
32817           SecondCascadedCMOV.getOperand(0).getReg())
32818       .addReg(FirstCMOV.getOperand(0).getReg());
32819 
32820   // Now remove the CMOVs.
32821   FirstCMOV.eraseFromParent();
32822   SecondCascadedCMOV.eraseFromParent();
32823 
32824   return SinkMBB;
32825 }
32826 
32827 MachineBasicBlock *
32828 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
32829                                      MachineBasicBlock *ThisMBB) const {
32830   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32831   const DebugLoc &DL = MI.getDebugLoc();
32832 
32833   // To "insert" a SELECT_CC instruction, we actually have to insert the
32834   // diamond control-flow pattern.  The incoming instruction knows the
32835   // destination vreg to set, the condition code register to branch on, the
32836   // true/false values to select between and a branch opcode to use.
32837 
32838   //  ThisMBB:
32839   //  ...
32840   //   TrueVal = ...
32841   //   cmpTY ccX, r1, r2
32842   //   bCC copy1MBB
32843   //   fallthrough --> FalseMBB
32844 
32845   // This code lowers all pseudo-CMOV instructions. Generally it lowers these
32846   // as described above, by inserting a BB, and then making a PHI at the join
32847   // point to select the true and false operands of the CMOV in the PHI.
32848   //
32849   // The code also handles two different cases of multiple CMOV opcodes
32850   // in a row.
32851   //
32852   // Case 1:
  // In this case, there are multiple CMOVs in a row, all of which are based on
32854   // the same condition setting (or the exact opposite condition setting).
32855   // In this case we can lower all the CMOVs using a single inserted BB, and
32856   // then make a number of PHIs at the join point to model the CMOVs. The only
  // trickiness here is that in a case like:
32858   //
32859   // t2 = CMOV cond1 t1, f1
32860   // t3 = CMOV cond1 t2, f2
32861   //
32862   // when rewriting this into PHIs, we have to perform some renaming on the
32863   // temps since you cannot have a PHI operand refer to a PHI result earlier
32864   // in the same block.  The "simple" but wrong lowering would be:
32865   //
32866   // t2 = PHI t1(BB1), f1(BB2)
32867   // t3 = PHI t2(BB1), f2(BB2)
32868   //
32869   // but clearly t2 is not defined in BB1, so that is incorrect. The proper
32870   // renaming is to note that on the path through BB1, t2 is really just a
32871   // copy of t1, and do that renaming, properly generating:
32872   //
32873   // t2 = PHI t1(BB1), f1(BB2)
32874   // t3 = PHI t1(BB1), f2(BB2)
32875   //
32876   // Case 2:
32877   // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
32878   // function - EmitLoweredCascadedSelect.
32879 
32880   X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
32881   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
32882   MachineInstr *LastCMOV = &MI;
32883   MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
32884 
32885   // Check for case 1, where there are multiple CMOVs with the same condition
32886   // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
32887   // number of jumps the most.
32888 
32889   if (isCMOVPseudo(MI)) {
    // See if we have a string of CMOVs with the same condition. Skip over
32891     // intervening debug insts.
32892     while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
32893            (NextMIIt->getOperand(3).getImm() == CC ||
32894             NextMIIt->getOperand(3).getImm() == OppCC)) {
32895       LastCMOV = &*NextMIIt;
32896       NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
32897     }
32898   }
32899 
  // Check for case 2, but only if we didn't already find case 1 (indicated by
  // LastCMOV still pointing at MI).
32902   if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
32903       NextMIIt->getOpcode() == MI.getOpcode() &&
32904       NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
32905       NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
32906       NextMIIt->getOperand(1).isKill()) {
32907     return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
32908   }
32909 
32910   const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
32911   MachineFunction *F = ThisMBB->getParent();
32912   MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
32913   MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
32914 
32915   MachineFunction::iterator It = ++ThisMBB->getIterator();
32916   F->insert(It, FalseMBB);
32917   F->insert(It, SinkMBB);
32918 
32919   // If the EFLAGS register isn't dead in the terminator, then claim that it's
32920   // live into the sink and copy blocks.
32921   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
32922   if (!LastCMOV->killsRegister(X86::EFLAGS) &&
32923       !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
32924     FalseMBB->addLiveIn(X86::EFLAGS);
32925     SinkMBB->addLiveIn(X86::EFLAGS);
32926   }
32927 
32928   // Transfer any debug instructions inside the CMOV sequence to the sunk block.
32929   auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
32930   auto DbgIt = MachineBasicBlock::iterator(MI);
32931   while (DbgIt != DbgEnd) {
32932     auto Next = std::next(DbgIt);
32933     if (DbgIt->isDebugInstr())
32934       SinkMBB->push_back(DbgIt->removeFromParent());
32935     DbgIt = Next;
32936   }
32937 
32938   // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
32939   SinkMBB->splice(SinkMBB->end(), ThisMBB,
32940                   std::next(MachineBasicBlock::iterator(LastCMOV)),
32941                   ThisMBB->end());
32942   SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
32943 
32944   // Fallthrough block for ThisMBB.
32945   ThisMBB->addSuccessor(FalseMBB);
  // The true block target of the first (or only) branch is always SinkMBB.
32947   ThisMBB->addSuccessor(SinkMBB);
32948   // Fallthrough block for FalseMBB.
32949   FalseMBB->addSuccessor(SinkMBB);
32950 
32951   // Create the conditional branch instruction.
32952   BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
32953 
32954   //  SinkMBB:
32955   //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
32956   //  ...
32957   MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
32958   MachineBasicBlock::iterator MIItEnd =
32959       std::next(MachineBasicBlock::iterator(LastCMOV));
32960   createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
32961 
32962   // Now remove the CMOV(s).
32963   ThisMBB->erase(MIItBegin, MIItEnd);
32964 
32965   return SinkMBB;
32966 }
32967 
32968 static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
32969   if (IsLP64) {
32970     if (isInt<8>(Imm))
32971       return X86::SUB64ri8;
32972     return X86::SUB64ri32;
32973   } else {
32974     if (isInt<8>(Imm))
32975       return X86::SUB32ri8;
32976     return X86::SUB32ri;
32977   }
32978 }
32979 
32980 MachineBasicBlock *
32981 X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
32982                                            MachineBasicBlock *MBB) const {
32983   MachineFunction *MF = MBB->getParent();
32984   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32985   const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
32986   const DebugLoc &DL = MI.getDebugLoc();
32987   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
32988 
32989   const unsigned ProbeSize = getStackProbeSize(*MF);
32990 
32991   MachineRegisterInfo &MRI = MF->getRegInfo();
32992   MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32993   MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32994   MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32995 
32996   MachineFunction::iterator MBBIter = ++MBB->getIterator();
32997   MF->insert(MBBIter, testMBB);
32998   MF->insert(MBBIter, blockMBB);
32999   MF->insert(MBBIter, tailMBB);
33000 
33001   Register sizeVReg = MI.getOperand(1).getReg();
33002 
33003   Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
33004 
33005   Register TmpStackPtr = MRI.createVirtualRegister(
33006       TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
33007   Register FinalStackPtr = MRI.createVirtualRegister(
33008       TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
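  // Sketch of the emitted probe loop (pseudo-code):
  //   FinalStackPtr = SP - AllocaSize;
  //   while (SP > FinalStackPtr) {   // testMBB
  //     *SP ^= 0;                    // blockMBB: touch the current page
  //     SP -= ProbeSize;
  //   }
  //   Result = FinalStackPtr;        // tailMBB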
33009 
33010   BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
33011       .addReg(physSPReg);
33012   {
33013     const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
33014     BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
33015         .addReg(TmpStackPtr)
33016         .addReg(sizeVReg);
33017   }
33018 
  // Loop test: exit once the stack pointer has reached (or gone below)
  // FinalStackPtr.
33020 
33021   BuildMI(testMBB, DL,
33022           TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
33023       .addReg(FinalStackPtr)
33024       .addReg(physSPReg);
33025 
33026   BuildMI(testMBB, DL, TII->get(X86::JCC_1))
33027       .addMBB(tailMBB)
33028       .addImm(X86::COND_GE);
33029   testMBB->addSuccessor(blockMBB);
33030   testMBB->addSuccessor(tailMBB);
33031 
  // Touch the block then extend it. This is the opposite order from a static
  // probe, where we allocate then touch; doing it this way avoids having to
  // probe the tail of the static alloca. Possible scenarios are:
33035   //
33036   //       + ---- <- ------------ <- ------------- <- ------------ +
33037   //       |                                                       |
33038   // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
33039   //                                                               |                                                               |
33040   //                                                               + <- ----------- <- ------------ <- ----------- <- ------------ +
33041   //
33042   // The property we want to enforce is to never have more than [page alloc] between two probes.
33043 
33044   const unsigned XORMIOpc =
33045       TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
33046   addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
33047       .addImm(0);
33048 
33049   BuildMI(blockMBB, DL,
33050           TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
33051       .addReg(physSPReg)
33052       .addImm(ProbeSize);
33053 
33055   BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
33056   blockMBB->addSuccessor(testMBB);
33057 
33058   // Replace original instruction by the expected stack ptr
33059   BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
33060       .addReg(FinalStackPtr);
33061 
33062   tailMBB->splice(tailMBB->end(), MBB,
33063                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33064   tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
33065   MBB->addSuccessor(testMBB);
33066 
33067   // Delete the original pseudo instruction.
33068   MI.eraseFromParent();
33069 
33070   // And we're done.
33071   return tailMBB;
33072 }
33073 
33074 MachineBasicBlock *
33075 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
33076                                         MachineBasicBlock *BB) const {
33077   MachineFunction *MF = BB->getParent();
33078   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33079   const DebugLoc &DL = MI.getDebugLoc();
33080   const BasicBlock *LLVM_BB = BB->getBasicBlock();
33081 
33082   assert(MF->shouldSplitStack());
33083 
33084   const bool Is64Bit = Subtarget.is64Bit();
33085   const bool IsLP64 = Subtarget.isTarget64BitLP64();
33086 
33087   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
33088   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
33089 
33090   // BB:
33091   //  ... [Till the alloca]
33092   // If stacklet is not large enough, jump to mallocMBB
33093   //
33094   // bumpMBB:
33095   //  Allocate by subtracting from RSP
33096   //  Jump to continueMBB
33097   //
33098   // mallocMBB:
33099   //  Allocate by call to runtime
33100   //
33101   // continueMBB:
33102   //  ...
33103   //  [rest of original BB]
33104   //
33105 
33106   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33107   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33108   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33109 
33110   MachineRegisterInfo &MRI = MF->getRegInfo();
33111   const TargetRegisterClass *AddrRegClass =
33112       getRegClassFor(getPointerTy(MF->getDataLayout()));
33113 
33114   Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
33115            bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
33116            tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
33117            SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
33118            sizeVReg = MI.getOperand(1).getReg(),
33119            physSPReg =
33120                IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
33121 
33122   MachineFunction::iterator MBBIter = ++BB->getIterator();
33123 
33124   MF->insert(MBBIter, bumpMBB);
33125   MF->insert(MBBIter, mallocMBB);
33126   MF->insert(MBBIter, continueMBB);
33127 
33128   continueMBB->splice(continueMBB->begin(), BB,
33129                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
33130   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
33131 
33132   // Add code to the main basic block to check if the stack limit has been hit,
33133   // and if so, jump to mallocMBB otherwise to bumpMBB.
33134   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
33135   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
33136     .addReg(tmpSPVReg).addReg(sizeVReg);
33137   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
33138     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
33139     .addReg(SPLimitVReg);
33140   BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
33141 
33142   // bumpMBB simply decreases the stack pointer, since we know the current
33143   // stacklet has enough space.
33144   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
33145     .addReg(SPLimitVReg);
33146   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
33147     .addReg(SPLimitVReg);
33148   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
33149 
33150   // Calls into a routine in libgcc to allocate more space from the heap.
33151   const uint32_t *RegMask =
33152       Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
33153   if (IsLP64) {
33154     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
33155       .addReg(sizeVReg);
33156     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
33157       .addExternalSymbol("__morestack_allocate_stack_space")
33158       .addRegMask(RegMask)
33159       .addReg(X86::RDI, RegState::Implicit)
33160       .addReg(X86::RAX, RegState::ImplicitDefine);
33161   } else if (Is64Bit) {
33162     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
33163       .addReg(sizeVReg);
33164     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
33165       .addExternalSymbol("__morestack_allocate_stack_space")
33166       .addRegMask(RegMask)
33167       .addReg(X86::EDI, RegState::Implicit)
33168       .addReg(X86::EAX, RegState::ImplicitDefine);
33169   } else {
33170     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
33171       .addImm(12);
33172     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
33173     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
33174       .addExternalSymbol("__morestack_allocate_stack_space")
33175       .addRegMask(RegMask)
33176       .addReg(X86::EAX, RegState::ImplicitDefine);
33177   }
33178 
33179   if (!Is64Bit)
33180     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
33181       .addImm(16);
33182 
33183   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
33184     .addReg(IsLP64 ? X86::RAX : X86::EAX);
33185   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
33186 
33187   // Set up the CFG correctly.
33188   BB->addSuccessor(bumpMBB);
33189   BB->addSuccessor(mallocMBB);
33190   mallocMBB->addSuccessor(continueMBB);
33191   bumpMBB->addSuccessor(continueMBB);
33192 
33193   // Take care of the PHI nodes.
33194   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
33195           MI.getOperand(0).getReg())
33196       .addReg(mallocPtrVReg)
33197       .addMBB(mallocMBB)
33198       .addReg(bumpSPPtrVReg)
33199       .addMBB(bumpMBB);
33200 
33201   // Delete the original pseudo instruction.
33202   MI.eraseFromParent();
33203 
33204   // And we're done.
33205   return continueMBB;
33206 }
33207 
33208 MachineBasicBlock *
33209 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
33210                                        MachineBasicBlock *BB) const {
33211   MachineFunction *MF = BB->getParent();
33212   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
33213   MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
33214   const DebugLoc &DL = MI.getDebugLoc();
33215 
33216   assert(!isAsynchronousEHPersonality(
33217              classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
33218          "SEH does not use catchret!");
33219 
33220   // Only 32-bit EH needs to worry about manually restoring stack pointers.
33221   if (!Subtarget.is32Bit())
33222     return BB;
33223 
33224   // C++ EH creates a new target block to hold the restore code, and wires up
33225   // the new block to the return destination with a normal JMP_4.
33226   MachineBasicBlock *RestoreMBB =
33227       MF->CreateMachineBasicBlock(BB->getBasicBlock());
33228   assert(BB->succ_size() == 1);
33229   MF->insert(std::next(BB->getIterator()), RestoreMBB);
33230   RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
33231   BB->addSuccessor(RestoreMBB);
33232   MI.getOperand(0).setMBB(RestoreMBB);
33233 
33234   // Marking this as an EH pad but not a funclet entry block causes PEI to
33235   // restore stack pointers in the block.
33236   RestoreMBB->setIsEHPad(true);
33237 
33238   auto RestoreMBBI = RestoreMBB->begin();
33239   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
33240   return BB;
33241 }
33242 
33243 MachineBasicBlock *
33244 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
33245                                       MachineBasicBlock *BB) const {
  // Here we replace TLSADDR with the sequence:
  // adjust_stackdown -> TLSADDR -> adjust_stackup.
  // We need this because TLSADDR is lowered into a call inside MC, so without
  // the two markers shrink-wrapping may push the prologue/epilogue past them.
33251   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
33252   const DebugLoc &DL = MI.getDebugLoc();
33253   MachineFunction &MF = *BB->getParent();
33254 
33255   // Emit CALLSEQ_START right before the instruction.
33256   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
33257   MachineInstrBuilder CallseqStart =
33258     BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
33259   BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
33260 
33261   // Emit CALLSEQ_END right after the instruction.
33262   // We don't call erase from parent because we want to keep the
33263   // original instruction around.
33264   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
33265   MachineInstrBuilder CallseqEnd =
33266     BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
33267   BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
33268 
33269   return BB;
33270 }
33271 
33272 MachineBasicBlock *
33273 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
33274                                       MachineBasicBlock *BB) const {
  // This is pretty easy.  We take the value that we received from our load of
  // the relocation, stick it in either RDI (x86-64) or EAX and do an indirect
  // call.  The return value will then be in the normal return register.
33279   MachineFunction *F = BB->getParent();
33280   const X86InstrInfo *TII = Subtarget.getInstrInfo();
33281   const DebugLoc &DL = MI.getDebugLoc();
33282 
33283   assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
33284   assert(MI.getOperand(3).isGlobal() && "This should be a global");
33285 
33286   // Get a register mask for the lowered call.
33287   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
33288   // proper register mask.
33289   const uint32_t *RegMask =
33290       Subtarget.is64Bit() ?
33291       Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
33292       Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
33293   if (Subtarget.is64Bit()) {
33294     MachineInstrBuilder MIB =
33295         BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
33296             .addReg(X86::RIP)
33297             .addImm(0)
33298             .addReg(0)
33299             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33300                               MI.getOperand(3).getTargetFlags())
33301             .addReg(0);
33302     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
33303     addDirectMem(MIB, X86::RDI);
33304     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
33305   } else if (!isPositionIndependent()) {
33306     MachineInstrBuilder MIB =
33307         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
33308             .addReg(0)
33309             .addImm(0)
33310             .addReg(0)
33311             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33312                               MI.getOperand(3).getTargetFlags())
33313             .addReg(0);
33314     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
33315     addDirectMem(MIB, X86::EAX);
33316     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
33317   } else {
33318     MachineInstrBuilder MIB =
33319         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
33320             .addReg(TII->getGlobalBaseReg(F))
33321             .addImm(0)
33322             .addReg(0)
33323             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33324                               MI.getOperand(3).getTargetFlags())
33325             .addReg(0);
33326     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
33327     addDirectMem(MIB, X86::EAX);
33328     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
33329   }
33330 
33331   MI.eraseFromParent(); // The pseudo instruction is gone now.
33332   return BB;
33333 }
33334 
33335 static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
33336   switch (RPOpc) {
33337   case X86::INDIRECT_THUNK_CALL32:
33338     return X86::CALLpcrel32;
33339   case X86::INDIRECT_THUNK_CALL64:
33340     return X86::CALL64pcrel32;
33341   case X86::INDIRECT_THUNK_TCRETURN32:
33342     return X86::TCRETURNdi;
33343   case X86::INDIRECT_THUNK_TCRETURN64:
33344     return X86::TCRETURNdi64;
33345   }
33346   llvm_unreachable("not indirect thunk opcode");
33347 }
33348 
33349 static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
33350                                           unsigned Reg) {
33351   if (Subtarget.useRetpolineExternalThunk()) {
33352     // When using an external thunk for retpolines, we pick names that match the
33353     // names GCC happens to use as well. This helps simplify the implementation
33354     // of the thunks for kernels where they have no easy ability to create
33355     // aliases and are doing non-trivial configuration of the thunk's body. For
33356     // example, the Linux kernel will do boot-time hot patching of the thunk
33357     // bodies and cannot easily export aliases of these to loaded modules.
33358     //
33359     // Note that at any point in the future, we may need to change the semantics
33360     // of how we implement retpolines and at that time will likely change the
33361     // name of the called thunk. Essentially, there is no hard guarantee that
33362     // LLVM will generate calls to specific thunks, we merely make a best-effort
33363     // attempt to help out kernels and other systems where duplicating the
33364     // thunks is costly.
33365     switch (Reg) {
33366     case X86::EAX:
33367       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33368       return "__x86_indirect_thunk_eax";
33369     case X86::ECX:
33370       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33371       return "__x86_indirect_thunk_ecx";
33372     case X86::EDX:
33373       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33374       return "__x86_indirect_thunk_edx";
33375     case X86::EDI:
33376       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33377       return "__x86_indirect_thunk_edi";
33378     case X86::R11:
33379       assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
33380       return "__x86_indirect_thunk_r11";
33381     }
33382     llvm_unreachable("unexpected reg for external indirect thunk");
33383   }
33384 
33385   if (Subtarget.useRetpolineIndirectCalls() ||
33386       Subtarget.useRetpolineIndirectBranches()) {
33387     // When targeting an internal COMDAT thunk use an LLVM-specific name.
33388     switch (Reg) {
33389     case X86::EAX:
33390       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33391       return "__llvm_retpoline_eax";
33392     case X86::ECX:
33393       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33394       return "__llvm_retpoline_ecx";
33395     case X86::EDX:
33396       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33397       return "__llvm_retpoline_edx";
33398     case X86::EDI:
33399       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33400       return "__llvm_retpoline_edi";
33401     case X86::R11:
33402       assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
33403       return "__llvm_retpoline_r11";
33404     }
33405     llvm_unreachable("unexpected reg for retpoline");
33406   }
33407 
33408   if (Subtarget.useLVIControlFlowIntegrity()) {
33409     assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
33410     return "__llvm_lvi_thunk_r11";
33411   }
33412   llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
33413 }
33414 
33415 MachineBasicBlock *
33416 X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
33417                                             MachineBasicBlock *BB) const {
  // Copy the virtual register holding the callee into a scratch physical
  // register (R11 on 64-bit targets) and call the indirect thunk.
33420   const DebugLoc &DL = MI.getDebugLoc();
33421   const X86InstrInfo *TII = Subtarget.getInstrInfo();
33422   Register CalleeVReg = MI.getOperand(0).getReg();
33423   unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
33424 
33425   // Find an available scratch register to hold the callee. On 64-bit, we can
33426   // just use R11, but we scan for uses anyway to ensure we don't generate
33427   // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
33428   // already a register use operand to the call to hold the callee. If none
33429   // are available, use EDI instead. EDI is chosen because EBX is the PIC base
33430   // register and ESI is the base pointer to realigned stack frames with VLAs.
33431   SmallVector<unsigned, 3> AvailableRegs;
33432   if (Subtarget.is64Bit())
33433     AvailableRegs.push_back(X86::R11);
33434   else
33435     AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
33436 
33437   // Zero out any registers that are already used.
33438   for (const auto &MO : MI.operands()) {
33439     if (MO.isReg() && MO.isUse())
33440       for (unsigned &Reg : AvailableRegs)
33441         if (Reg == MO.getReg())
33442           Reg = 0;
33443   }
33444 
33445   // Choose the first remaining non-zero available register.
33446   unsigned AvailableReg = 0;
33447   for (unsigned MaybeReg : AvailableRegs) {
33448     if (MaybeReg) {
33449       AvailableReg = MaybeReg;
33450       break;
33451     }
33452   }
33453   if (!AvailableReg)
33454     report_fatal_error("calling convention incompatible with retpoline, no "
33455                        "available registers");
33456 
33457   const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
33458 
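  // Copy the callee into the scratch register, then rewrite the pseudo in
  // place into a direct call (or tail call) of the thunk symbol, keeping the
  // scratch register alive across it.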
33459   BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
33460       .addReg(CalleeVReg);
33461   MI.getOperand(0).ChangeToES(Symbol);
33462   MI.setDesc(TII->get(Opc));
33463   MachineInstrBuilder(*BB->getParent(), &MI)
33464       .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
33465   return BB;
33466 }
33467 
/// SetJmp implies a future control-flow change when the corresponding LongJmp
/// is called.
/// Instead of using the 'return' instruction, the long jump fixes the stack and
/// performs an indirect branch. To do so it uses the registers that were stored
/// in the jump buffer (when calling SetJmp).
/// If the shadow stack is enabled we need to fix it as well, because some
/// return addresses will be skipped.
/// This function saves the SSP for the later fix performed by
/// emitLongJmpShadowStackFix.
33477 /// \sa emitLongJmpShadowStackFix
33478 /// \param [in] MI The temporary Machine Instruction for the builtin.
33479 /// \param [in] MBB The Machine Basic Block that will be modified.
33480 void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
33481                                                  MachineBasicBlock *MBB) const {
33482   const DebugLoc &DL = MI.getDebugLoc();
33483   MachineFunction *MF = MBB->getParent();
33484   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33485   MachineRegisterInfo &MRI = MF->getRegInfo();
33486   MachineInstrBuilder MIB;
33487 
33488   // Memory Reference.
33489   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33490                                            MI.memoperands_end());
33491 
33492   // Initialize a register with zero.
33493   MVT PVT = getPointerTy(MF->getDataLayout());
33494   const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33495   Register ZReg = MRI.createVirtualRegister(PtrRC);
33496   unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
33497   BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
33498       .addDef(ZReg)
33499       .addReg(ZReg, RegState::Undef)
33500       .addReg(ZReg, RegState::Undef);
33501 
33502   // Read the current SSP Register value to the zeroed register.
33503   Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
33504   unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
33505   BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
33506 
  // Write the SSP register value into slot 3 (offset 3 * pointer size) of the
  // input memory buffer.
33508   unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33509   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
33510   const int64_t SSPOffset = 3 * PVT.getStoreSize();
33511   const unsigned MemOpndSlot = 1;
33512   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33513     if (i == X86::AddrDisp)
33514       MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
33515     else
33516       MIB.add(MI.getOperand(MemOpndSlot + i));
33517   }
33518   MIB.addReg(SSPCopyReg);
33519   MIB.setMemRefs(MMOs);
33520 }
33521 
33522 MachineBasicBlock *
33523 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
33524                                     MachineBasicBlock *MBB) const {
33525   const DebugLoc &DL = MI.getDebugLoc();
33526   MachineFunction *MF = MBB->getParent();
33527   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33528   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
33529   MachineRegisterInfo &MRI = MF->getRegInfo();
33530 
33531   const BasicBlock *BB = MBB->getBasicBlock();
33532   MachineFunction::iterator I = ++MBB->getIterator();
33533 
33534   // Memory Reference
33535   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33536                                            MI.memoperands_end());
33537 
33538   unsigned DstReg;
33539   unsigned MemOpndSlot = 0;
33540 
33541   unsigned CurOp = 0;
33542 
33543   DstReg = MI.getOperand(CurOp++).getReg();
33544   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
33545   assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
33546   (void)TRI;
33547   Register mainDstReg = MRI.createVirtualRegister(RC);
33548   Register restoreDstReg = MRI.createVirtualRegister(RC);
33549 
33550   MemOpndSlot = CurOp;
33551 
33552   MVT PVT = getPointerTy(MF->getDataLayout());
33553   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
33554          "Invalid Pointer Size!");
33555 
33556   // For v = setjmp(buf), we generate
33557   //
33558   // thisMBB:
33559   //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
33560   //  SjLjSetup restoreMBB
33561   //
33562   // mainMBB:
33563   //  v_main = 0
33564   //
33565   // sinkMBB:
33566   //  v = phi(main, restore)
33567   //
33568   // restoreMBB:
33569   //  if base pointer being used, load it from frame
33570   //  v_restore = 1
33571 
33572   MachineBasicBlock *thisMBB = MBB;
33573   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
33574   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33575   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
33576   MF->insert(I, mainMBB);
33577   MF->insert(I, sinkMBB);
33578   MF->push_back(restoreMBB);
33579   restoreMBB->setHasAddressTaken();
33580 
33581   MachineInstrBuilder MIB;
33582 
33583   // Transfer the remainder of BB and its successor edges to sinkMBB.
33584   sinkMBB->splice(sinkMBB->begin(), MBB,
33585                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33586   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33587 
33588   // thisMBB:
33589   unsigned PtrStoreOpc = 0;
33590   unsigned LabelReg = 0;
33591   const int64_t LabelOffset = 1 * PVT.getStoreSize();
33592   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
33593                      !isPositionIndependent();
33594 
33595   // Prepare IP either in reg or imm.
33596   if (!UseImmLabel) {
33597     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33598     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33599     LabelReg = MRI.createVirtualRegister(PtrRC);
33600     if (Subtarget.is64Bit()) {
33601       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
33602               .addReg(X86::RIP)
33603               .addImm(0)
33604               .addReg(0)
33605               .addMBB(restoreMBB)
33606               .addReg(0);
33607     } else {
33608       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
33609       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
33610               .addReg(XII->getGlobalBaseReg(MF))
33611               .addImm(0)
33612               .addReg(0)
33613               .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
33614               .addReg(0);
33615     }
33616   } else
33617     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
33618   // Store IP
33619   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
33620   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33621     if (i == X86::AddrDisp)
33622       MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
33623     else
33624       MIB.add(MI.getOperand(MemOpndSlot + i));
33625   }
33626   if (!UseImmLabel)
33627     MIB.addReg(LabelReg);
33628   else
33629     MIB.addMBB(restoreMBB);
33630   MIB.setMemRefs(MMOs);
33631 
33632   if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
33633     emitSetJmpShadowStackFix(MI, thisMBB);
33634   }
33635 
33636   // Setup
33637   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
33638           .addMBB(restoreMBB);
33639 
33640   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
33641   MIB.addRegMask(RegInfo->getNoPreservedMask());
33642   thisMBB->addSuccessor(mainMBB);
33643   thisMBB->addSuccessor(restoreMBB);
33644 
33645   // mainMBB:
  //  v_main = 0
33647   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
33648   mainMBB->addSuccessor(sinkMBB);
33649 
33650   // sinkMBB:
33651   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
33652           TII->get(X86::PHI), DstReg)
33653     .addReg(mainDstReg).addMBB(mainMBB)
33654     .addReg(restoreDstReg).addMBB(restoreMBB);
33655 
33656   // restoreMBB:
33657   if (RegInfo->hasBasePointer(*MF)) {
33658     const bool Uses64BitFramePtr =
33659         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
33660     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
33661     X86FI->setRestoreBasePointer(MF);
33662     Register FramePtr = RegInfo->getFrameRegister(*MF);
33663     Register BasePtr = RegInfo->getBaseRegister();
33664     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
33665     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
33666                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
33667       .setMIFlag(MachineInstr::FrameSetup);
33668   }
33669   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
33670   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
33671   restoreMBB->addSuccessor(sinkMBB);
33672 
33673   MI.eraseFromParent();
33674   return sinkMBB;
33675 }
33676 
33677 /// Fix the shadow stack using the previously saved SSP pointer.
33678 /// \sa emitSetJmpShadowStackFix
33679 /// \param [in] MI The temporary Machine Instruction for the builtin.
33680 /// \param [in] MBB The Machine Basic Block that will be modified.
33681 /// \return The sink MBB that will perform the future indirect branch.
33682 MachineBasicBlock *
33683 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
33684                                              MachineBasicBlock *MBB) const {
33685   const DebugLoc &DL = MI.getDebugLoc();
33686   MachineFunction *MF = MBB->getParent();
33687   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33688   MachineRegisterInfo &MRI = MF->getRegInfo();
33689 
33690   // Memory Reference
33691   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33692                                            MI.memoperands_end());
33693 
33694   MVT PVT = getPointerTy(MF->getDataLayout());
33695   const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33696 
33697   // checkSspMBB:
33698   //         xor vreg1, vreg1
33699   //         rdssp vreg1
33700   //         test vreg1, vreg1
33701   //         je sinkMBB   # Jump if Shadow Stack is not supported
33702   // fallMBB:
33703   //         mov buf+24/12(%rip), vreg2
33704   //         sub vreg1, vreg2
33705   //         jbe sinkMBB  # No need to fix the Shadow Stack
33706   // fixShadowMBB:
33707   //         shr 3/2, vreg2
33708   //         incssp vreg2  # fix the SSP according to the lower 8 bits
33709   //         shr 8, vreg2
33710   //         je sinkMBB
33711   // fixShadowLoopPrepareMBB:
33712   //         shl vreg2
33713   //         mov 128, vreg3
33714   // fixShadowLoopMBB:
33715   //         incssp vreg3
33716   //         dec vreg2
33717   //         jne fixShadowLoopMBB # Iterate until you finish fixing
33718   //                              # the Shadow Stack
33719   // sinkMBB:
33720 
33721   MachineFunction::iterator I = ++MBB->getIterator();
33722   const BasicBlock *BB = MBB->getBasicBlock();
33723 
33724   MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
33725   MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
33726   MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
33727   MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
33728   MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
33729   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33730   MF->insert(I, checkSspMBB);
33731   MF->insert(I, fallMBB);
33732   MF->insert(I, fixShadowMBB);
33733   MF->insert(I, fixShadowLoopPrepareMBB);
33734   MF->insert(I, fixShadowLoopMBB);
33735   MF->insert(I, sinkMBB);
33736 
33737   // Transfer the remainder of BB and its successor edges to sinkMBB.
33738   sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
33739                   MBB->end());
33740   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33741 
33742   MBB->addSuccessor(checkSspMBB);
33743 
33744   // Initialize a register with zero.
33745   Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
33746   BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
33747 
33748   if (PVT == MVT::i64) {
33749     Register TmpZReg = MRI.createVirtualRegister(PtrRC);
33750     BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
33751       .addImm(0)
33752       .addReg(ZReg)
33753       .addImm(X86::sub_32bit);
33754     ZReg = TmpZReg;
33755   }
33756 
33757   // Read the current SSP Register value to the zeroed register.
33758   Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
33759   unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
33760   BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
33761 
  // If the SSP register value is zero, shadow stacks are not enabled, so jump
  // directly to the sink.
33764   unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
33765   BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
33766       .addReg(SSPCopyReg)
33767       .addReg(SSPCopyReg);
33768   BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
33769   checkSspMBB->addSuccessor(sinkMBB);
33770   checkSspMBB->addSuccessor(fallMBB);
33771 
33772   // Reload the previously saved SSP register value.
33773   Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
33774   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
  const int64_t SSPOffset = 3 * PVT.getStoreSize();
33776   MachineInstrBuilder MIB =
33777       BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
33778   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33779     const MachineOperand &MO = MI.getOperand(i);
33780     if (i == X86::AddrDisp)
      MIB.addDisp(MO, SSPOffset);
33782     else if (MO.isReg()) // Don't add the whole operand, we don't want to
33783                          // preserve kill flags.
33784       MIB.addReg(MO.getReg());
33785     else
33786       MIB.add(MO);
33787   }
33788   MIB.setMemRefs(MMOs);
33789 
33790   // Subtract the current SSP from the previous SSP.
33791   Register SspSubReg = MRI.createVirtualRegister(PtrRC);
33792   unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
33793   BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
33794       .addReg(PrevSSPReg)
33795       .addReg(SSPCopyReg);
33796 
33797   // Jump to sink in case PrevSSPReg <= SSPCopyReg.
33798   BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
33799   fallMBB->addSuccessor(sinkMBB);
33800   fallMBB->addSuccessor(fixShadowMBB);
33801 
33802   // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
33803   unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
33804   unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
33805   Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
33806   BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
33807       .addReg(SspSubReg)
33808       .addImm(Offset);
33809 
  // Advance the SSP using only the lower 8 bits of the delta.
33811   unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
33812   BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
33813 
33814   // Reset the lower 8 bits.
33815   Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
33816   BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
33817       .addReg(SspFirstShrReg)
33818       .addImm(8);
33819 
33820   // Jump if the result of the shift is zero.
33821   BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
33822   fixShadowMBB->addSuccessor(sinkMBB);
33823   fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
33824 
33825   // Do a single shift left.
33826   unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
33827   Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
33828   BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
33829       .addReg(SspSecondShrReg);
33830 
33831   // Save the value 128 to a register (will be used next with incssp).
33832   Register Value128InReg = MRI.createVirtualRegister(PtrRC);
33833   unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
33834   BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
33835       .addImm(128);
33836   fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
33837 
33838   // Since incssp only looks at the lower 8 bits, we might need to do several
33839   // iterations of incssp until we finish fixing the shadow stack.
33840   Register DecReg = MRI.createVirtualRegister(PtrRC);
33841   Register CounterReg = MRI.createVirtualRegister(PtrRC);
33842   BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
33843       .addReg(SspAfterShlReg)
33844       .addMBB(fixShadowLoopPrepareMBB)
33845       .addReg(DecReg)
33846       .addMBB(fixShadowLoopMBB);
33847 
33848   // Every iteration we increase the SSP by 128.
33849   BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
33850 
33851   // Every iteration we decrement the counter by 1.
33852   unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
33853   BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
33854 
33855   // Jump if the counter is not zero yet.
33856   BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
33857   fixShadowLoopMBB->addSuccessor(sinkMBB);
33858   fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
33859 
33860   return sinkMBB;
33861 }
33862 
33863 MachineBasicBlock *
33864 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
33865                                      MachineBasicBlock *MBB) const {
33866   const DebugLoc &DL = MI.getDebugLoc();
33867   MachineFunction *MF = MBB->getParent();
33868   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33869   MachineRegisterInfo &MRI = MF->getRegInfo();
33870 
33871   // Memory Reference
33872   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33873                                            MI.memoperands_end());
33874 
33875   MVT PVT = getPointerTy(MF->getDataLayout());
33876   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
33877          "Invalid Pointer Size!");
33878 
33879   const TargetRegisterClass *RC =
33880     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
33881   Register Tmp = MRI.createVirtualRegister(RC);
33882   // Since FP is only updated here but NOT referenced, it's treated as GPR.
33883   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
33884   Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
33885   Register SP = RegInfo->getStackRegister();
33886 
33887   MachineInstrBuilder MIB;
33888 
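  // The jump buffer is laid out in pointer-size slots: slot 0 holds the frame
  // pointer, slot 1 the resume label (IP), slot 2 the stack pointer, and
  // slot 3 the shadow stack pointer.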
33889   const int64_t LabelOffset = 1 * PVT.getStoreSize();
33890   const int64_t SPOffset = 2 * PVT.getStoreSize();
33891 
33892   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
33893   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
33894 
33895   MachineBasicBlock *thisMBB = MBB;
33896 
  // When CET shadow stacks are enabled, we need to fix the shadow stack.
33898   if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
33899     thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
33900   }
33901 
33902   // Reload FP
33903   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
33904   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33905     const MachineOperand &MO = MI.getOperand(i);
33906     if (MO.isReg()) // Don't add the whole operand, we don't want to
33907                     // preserve kill flags.
33908       MIB.addReg(MO.getReg());
33909     else
33910       MIB.add(MO);
33911   }
33912   MIB.setMemRefs(MMOs);
33913 
33914   // Reload IP
33915   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
33916   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33917     const MachineOperand &MO = MI.getOperand(i);
33918     if (i == X86::AddrDisp)
33919       MIB.addDisp(MO, LabelOffset);
33920     else if (MO.isReg()) // Don't add the whole operand, we don't want to
33921                          // preserve kill flags.
33922       MIB.addReg(MO.getReg());
33923     else
33924       MIB.add(MO);
33925   }
33926   MIB.setMemRefs(MMOs);
33927 
33928   // Reload SP
33929   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
33930   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33931     if (i == X86::AddrDisp)
33932       MIB.addDisp(MI.getOperand(i), SPOffset);
33933     else
33934       MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
33935                                  // the last instruction of the expansion.
33936   }
33937   MIB.setMemRefs(MMOs);
33938 
33939   // Jump
33940   BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
33941 
33942   MI.eraseFromParent();
33943   return thisMBB;
33944 }
33945 
33946 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
33947                                                MachineBasicBlock *MBB,
33948                                                MachineBasicBlock *DispatchBB,
33949                                                int FI) const {
33950   const DebugLoc &DL = MI.getDebugLoc();
33951   MachineFunction *MF = MBB->getParent();
33952   MachineRegisterInfo *MRI = &MF->getRegInfo();
33953   const X86InstrInfo *TII = Subtarget.getInstrInfo();
33954 
33955   MVT PVT = getPointerTy(MF->getDataLayout());
33956   assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
33957 
33958   unsigned Op = 0;
33959   unsigned VR = 0;
33960 
33961   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
33962                      !isPositionIndependent();
33963 
33964   if (UseImmLabel) {
33965     Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
33966   } else {
33967     const TargetRegisterClass *TRC =
33968         (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
33969     VR = MRI->createVirtualRegister(TRC);
33970     Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33971 
33972     if (Subtarget.is64Bit())
33973       BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
33974           .addReg(X86::RIP)
33975           .addImm(1)
33976           .addReg(0)
33977           .addMBB(DispatchBB)
33978           .addReg(0);
33979     else
33980       BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
33981           .addReg(0) /* TII->getGlobalBaseReg(MF) */
33982           .addImm(1)
33983           .addReg(0)
33984           .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
33985           .addReg(0);
33986   }
33987 
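  // Store the dispatch block address (either as an immediate label or via the
  // register computed above) into its reserved slot in the function context.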
33988   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
33989   addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
33990   if (UseImmLabel)
33991     MIB.addMBB(DispatchBB);
33992   else
33993     MIB.addReg(VR);
33994 }
33995 
33996 MachineBasicBlock *
33997 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
33998                                          MachineBasicBlock *BB) const {
33999   const DebugLoc &DL = MI.getDebugLoc();
34000   MachineFunction *MF = BB->getParent();
34001   MachineRegisterInfo *MRI = &MF->getRegInfo();
34002   const X86InstrInfo *TII = Subtarget.getInstrInfo();
34003   int FI = MF->getFrameInfo().getFunctionContextIndex();
34004 
34005   // Get a mapping of the call site numbers to all of the landing pads they're
34006   // associated with.
34007   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
34008   unsigned MaxCSNum = 0;
34009   for (auto &MBB : *MF) {
34010     if (!MBB.isEHPad())
34011       continue;
34012 
34013     MCSymbol *Sym = nullptr;
34014     for (const auto &MI : MBB) {
34015       if (MI.isDebugInstr())
34016         continue;
34017 
34018       assert(MI.isEHLabel() && "expected EH_LABEL");
34019       Sym = MI.getOperand(0).getMCSymbol();
34020       break;
34021     }
34022 
34023     if (!MF->hasCallSiteLandingPad(Sym))
34024       continue;
34025 
34026     for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
34027       CallSiteNumToLPad[CSI].push_back(&MBB);
34028       MaxCSNum = std::max(MaxCSNum, CSI);
34029     }
34030   }
34031 
34032   // Get an ordered list of the machine basic blocks for the jump table.
34033   std::vector<MachineBasicBlock *> LPadList;
34034   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
34035   LPadList.reserve(CallSiteNumToLPad.size());
34036 
34037   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
34038     for (auto &LP : CallSiteNumToLPad[CSI]) {
34039       LPadList.push_back(LP);
34040       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
34041     }
34042   }
34043 
34044   assert(!LPadList.empty() &&
34045          "No landing pad destinations for the dispatch jump table!");
34046 
34047   // Create the MBBs for the dispatch code.
34048 
34049   // Shove the dispatch's address into the return slot in the function context.
34050   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
34051   DispatchBB->setIsEHPad(true);
34052 
34053   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
34054   BuildMI(TrapBB, DL, TII->get(X86::TRAP));
34055   DispatchBB->addSuccessor(TrapBB);
34056 
34057   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
34058   DispatchBB->addSuccessor(DispContBB);
34059 
34060   // Insert MBBs.
34061   MF->push_back(DispatchBB);
34062   MF->push_back(DispContBB);
34063   MF->push_back(TrapBB);
34064 
34065   // Insert code into the entry block that creates and registers the function
34066   // context.
34067   SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
34068 
34069   // Create the jump table and associated information
34070   unsigned JTE = getJumpTableEncoding();
34071   MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
34072   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
34073 
34074   const X86RegisterInfo &RI = TII->getRegisterInfo();
34075   // Add a register mask with no preserved registers.  This results in all
34076   // registers being marked as clobbered.
34077   if (RI.hasBasePointer(*MF)) {
34078     const bool FPIs64Bit =
34079         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
34080     X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
34081     MFI->setRestoreBasePointer(MF);
34082 
34083     Register FP = RI.getFrameRegister(*MF);
34084     Register BP = RI.getBaseRegister();
34085     unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
34086     addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
34087                  MFI->getRestoreBasePointerOffset())
34088         .addRegMask(RI.getNoPreservedMask());
34089   } else {
34090     BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
34091         .addRegMask(RI.getNoPreservedMask());
34092   }
34093 
34094   // IReg is used as an index in a memory operand and therefore can't be SP
34095   Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
34096   addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
34097                     Subtarget.is64Bit() ? 8 : 4);
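  // Bounds-check the call-site index against the number of landing pads and
  // trap if it is out of range.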
34098   BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
34099       .addReg(IReg)
34100       .addImm(LPadList.size());
34101   BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
34102 
34103   if (Subtarget.is64Bit()) {
34104     Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
34105     Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
34106 
34107     // leaq .LJTI0_0(%rip), BReg
34108     BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
34109         .addReg(X86::RIP)
34110         .addImm(1)
34111         .addReg(0)
34112         .addJumpTableIndex(MJTI)
34113         .addReg(0);
34114     // movzx IReg64, IReg
34115     BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
34116         .addImm(0)
34117         .addReg(IReg)
34118         .addImm(X86::sub_32bit);
34119 
34120     switch (JTE) {
34121     case MachineJumpTableInfo::EK_BlockAddress:
34122       // jmpq *(BReg,IReg64,8)
34123       BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
34124           .addReg(BReg)
34125           .addImm(8)
34126           .addReg(IReg64)
34127           .addImm(0)
34128           .addReg(0);
34129       break;
34130     case MachineJumpTableInfo::EK_LabelDifference32: {
34131       Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
34132       Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
34133       Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
34134 
34135       // movl (BReg,IReg64,4), OReg
34136       BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
34137           .addReg(BReg)
34138           .addImm(4)
34139           .addReg(IReg64)
34140           .addImm(0)
34141           .addReg(0);
34142       // movsx OReg64, OReg
34143       BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
34144       // addq BReg, OReg64, TReg
34145       BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
34146           .addReg(OReg64)
34147           .addReg(BReg);
34148       // jmpq *TReg
34149       BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
34150       break;
34151     }
34152     default:
34153       llvm_unreachable("Unexpected jump table encoding");
34154     }
34155   } else {
34156     // jmpl *.LJTI0_0(,IReg,4)
34157     BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
34158         .addReg(0)
34159         .addImm(4)
34160         .addReg(IReg)
34161         .addJumpTableIndex(MJTI)
34162         .addReg(0);
34163   }
34164 
34165   // Add the jump table entries as successors to the MBB.
34166   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
34167   for (auto &LP : LPadList)
34168     if (SeenMBBs.insert(LP).second)
34169       DispContBB->addSuccessor(LP);
34170 
34171   // N.B. the order the invoke BBs are processed in doesn't matter here.
34172   SmallVector<MachineBasicBlock *, 64> MBBLPads;
34173   const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
34174   for (MachineBasicBlock *MBB : InvokeBBs) {
34175     // Remove the landing pad successor from the invoke block and replace it
34176     // with the new dispatch block.
34177     // Keep a copy of Successors since it's modified inside the loop.
34178     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
34179                                                    MBB->succ_rend());
34180     // FIXME: Avoid quadratic complexity.
34181     for (auto MBBS : Successors) {
34182       if (MBBS->isEHPad()) {
34183         MBB->removeSuccessor(MBBS);
34184         MBBLPads.push_back(MBBS);
34185       }
34186     }
34187 
34188     MBB->addSuccessor(DispatchBB);
34189 
34190     // Find the invoke call and mark all of the callee-saved registers as
34191     // 'implicit defined' so that they're spilled.  This prevents code from
34192     // moving instructions to before the EH block, where they will never be
34193     // executed.
34194     for (auto &II : reverse(*MBB)) {
34195       if (!II.isCall())
34196         continue;
34197 
34198       DenseMap<unsigned, bool> DefRegs;
34199       for (auto &MOp : II.operands())
34200         if (MOp.isReg())
34201           DefRegs[MOp.getReg()] = true;
34202 
34203       MachineInstrBuilder MIB(*MF, &II);
34204       for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
34205         unsigned Reg = SavedRegs[RegIdx];
34206         if (!DefRegs[Reg])
34207           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
34208       }
34209 
34210       break;
34211     }
34212   }
34213 
34214   // Mark all former landing pads as non-landing pads.  The dispatch is the only
34215   // landing pad now.
34216   for (auto &LP : MBBLPads)
34217     LP->setIsEHPad(false);
34218 
34219   // The instruction is gone now.
34220   MI.eraseFromParent();
34221   return BB;
34222 }
34223 
34224 MachineBasicBlock *
34225 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
34226                                                MachineBasicBlock *BB) const {
34227   MachineFunction *MF = BB->getParent();
34228   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34229   const DebugLoc &DL = MI.getDebugLoc();
34230 
34231   auto TMMImmToTMMReg = [](unsigned Imm) {
    assert(Imm < 8 && "Illegal tmm index");
34233     return X86::TMM0 + Imm;
34234   };
34235   switch (MI.getOpcode()) {
34236   default: llvm_unreachable("Unexpected instr type to insert");
34237   case X86::TLS_addr32:
34238   case X86::TLS_addr64:
34239   case X86::TLS_addrX32:
34240   case X86::TLS_base_addr32:
34241   case X86::TLS_base_addr64:
34242   case X86::TLS_base_addrX32:
34243     return EmitLoweredTLSAddr(MI, BB);
34244   case X86::INDIRECT_THUNK_CALL32:
34245   case X86::INDIRECT_THUNK_CALL64:
34246   case X86::INDIRECT_THUNK_TCRETURN32:
34247   case X86::INDIRECT_THUNK_TCRETURN64:
34248     return EmitLoweredIndirectThunk(MI, BB);
34249   case X86::CATCHRET:
34250     return EmitLoweredCatchRet(MI, BB);
34251   case X86::SEG_ALLOCA_32:
34252   case X86::SEG_ALLOCA_64:
34253     return EmitLoweredSegAlloca(MI, BB);
34254   case X86::PROBED_ALLOCA_32:
34255   case X86::PROBED_ALLOCA_64:
34256     return EmitLoweredProbedAlloca(MI, BB);
34257   case X86::TLSCall_32:
34258   case X86::TLSCall_64:
34259     return EmitLoweredTLSCall(MI, BB);
34260   case X86::CMOV_FR32:
34261   case X86::CMOV_FR32X:
34262   case X86::CMOV_FR64:
34263   case X86::CMOV_FR64X:
34264   case X86::CMOV_GR8:
34265   case X86::CMOV_GR16:
34266   case X86::CMOV_GR32:
34267   case X86::CMOV_RFP32:
34268   case X86::CMOV_RFP64:
34269   case X86::CMOV_RFP80:
34270   case X86::CMOV_VR64:
34271   case X86::CMOV_VR128:
34272   case X86::CMOV_VR128X:
34273   case X86::CMOV_VR256:
34274   case X86::CMOV_VR256X:
34275   case X86::CMOV_VR512:
34276   case X86::CMOV_VK1:
34277   case X86::CMOV_VK2:
34278   case X86::CMOV_VK4:
34279   case X86::CMOV_VK8:
34280   case X86::CMOV_VK16:
34281   case X86::CMOV_VK32:
34282   case X86::CMOV_VK64:
34283     return EmitLoweredSelect(MI, BB);
34284 
34285   case X86::RDFLAGS32:
34286   case X86::RDFLAGS64: {
34287     unsigned PushF =
34288         MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
34289     unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
34290     MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
34291     // Permit reads of the EFLAGS and DF registers without them being defined.
34292     // This intrinsic exists to read external processor state in flags, such as
34293     // the trap flag, interrupt flag, and direction flag, none of which are
34294     // modeled by the backend.
34295     assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
34296            "Unexpected register in operand!");
34297     Push->getOperand(2).setIsUndef();
34298     assert(Push->getOperand(3).getReg() == X86::DF &&
34299            "Unexpected register in operand!");
34300     Push->getOperand(3).setIsUndef();
34301     BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
34302 
34303     MI.eraseFromParent(); // The pseudo is gone now.
34304     return BB;
34305   }
34306 
34307   case X86::WRFLAGS32:
34308   case X86::WRFLAGS64: {
34309     unsigned Push =
34310         MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
34311     unsigned PopF =
34312         MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
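    // Write the flags by pushing the source register and then popping it into
    // EFLAGS with POPF.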
34313     BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
34314     BuildMI(*BB, MI, DL, TII->get(PopF));
34315 
34316     MI.eraseFromParent(); // The pseudo is gone now.
34317     return BB;
34318   }
34319 
34320   case X86::FP32_TO_INT16_IN_MEM:
34321   case X86::FP32_TO_INT32_IN_MEM:
34322   case X86::FP32_TO_INT64_IN_MEM:
34323   case X86::FP64_TO_INT16_IN_MEM:
34324   case X86::FP64_TO_INT32_IN_MEM:
34325   case X86::FP64_TO_INT64_IN_MEM:
34326   case X86::FP80_TO_INT16_IN_MEM:
34327   case X86::FP80_TO_INT32_IN_MEM:
34328   case X86::FP80_TO_INT64_IN_MEM: {
34329     // Change the floating point control register to use "round towards zero"
34330     // mode when truncating to an integer value.
34331     int OrigCWFrameIdx =
34332         MF->getFrameInfo().CreateStackObject(2, Align(2), false);
34333     addFrameReference(BuildMI(*BB, MI, DL,
34334                               TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
34335 
34336     // Load the old value of the control word...
34337     Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
34338     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
34339                       OrigCWFrameIdx);
34340 
    // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
34342     Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
34343     BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
34344       .addReg(OldCW, RegState::Kill).addImm(0xC00);
34345 
34346     // Extract to 16 bits.
34347     Register NewCW16 =
34348         MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
34349     BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
34350       .addReg(NewCW, RegState::Kill, X86::sub_16bit);
34351 
34352     // Prepare memory for FLDCW.
34353     int NewCWFrameIdx =
34354         MF->getFrameInfo().CreateStackObject(2, Align(2), false);
34355     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
34356                       NewCWFrameIdx)
34357       .addReg(NewCW16, RegState::Kill);
34358 
34359     // Reload the modified control word now...
34360     addFrameReference(BuildMI(*BB, MI, DL,
34361                               TII->get(X86::FLDCW16m)), NewCWFrameIdx);
34362 
34363     // Get the X86 opcode to use.
34364     unsigned Opc;
34365     switch (MI.getOpcode()) {
34366     default: llvm_unreachable("illegal opcode!");
34367     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
34368     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
34369     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
34370     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
34371     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
34372     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
34373     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
34374     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
34375     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
34376     }
34377 
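    // Emit the FP store-to-integer, reusing the pseudo's memory operands and
    // its FP source register.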
34378     X86AddressMode AM = getAddressFromInstr(&MI, 0);
34379     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
34380         .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
34381 
34382     // Reload the original control word now.
34383     addFrameReference(BuildMI(*BB, MI, DL,
34384                               TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
34385 
34386     MI.eraseFromParent(); // The pseudo instruction is gone now.
34387     return BB;
34388   }
34389 
34390   // xbegin
34391   case X86::XBEGIN:
34392     return emitXBegin(MI, BB, Subtarget.getInstrInfo());
34393 
34394   case X86::VAARG_64:
34395   case X86::VAARG_X32:
34396     return EmitVAARGWithCustomInserter(MI, BB);
34397 
34398   case X86::EH_SjLj_SetJmp32:
34399   case X86::EH_SjLj_SetJmp64:
34400     return emitEHSjLjSetJmp(MI, BB);
34401 
34402   case X86::EH_SjLj_LongJmp32:
34403   case X86::EH_SjLj_LongJmp64:
34404     return emitEHSjLjLongJmp(MI, BB);
34405 
34406   case X86::Int_eh_sjlj_setup_dispatch:
34407     return EmitSjLjDispatchBlock(MI, BB);
34408 
34409   case TargetOpcode::STATEPOINT:
34410     // As an implementation detail, STATEPOINT shares the STACKMAP format at
34411     // this point in the process.  We diverge later.
34412     return emitPatchPoint(MI, BB);
34413 
34414   case TargetOpcode::STACKMAP:
34415   case TargetOpcode::PATCHPOINT:
34416     return emitPatchPoint(MI, BB);
34417 
34418   case TargetOpcode::PATCHABLE_EVENT_CALL:
34419   case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
34420     return BB;
34421 
34422   case X86::LCMPXCHG8B: {
34423     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
    // In addition to the 4 E[ABCD] registers implied by the encoding,
    // CMPXCHG8B requires a memory operand. If the current target is i686 and
    // the current function needs a base pointer (which is ESI on i686), the
    // register allocator would not be able to allocate registers for an
    // address of the form X(%reg, %reg, Y): there would never be enough
    // unreserved registers during regalloc (without the base pointer the only
    // option would be X(%edi, %esi, Y)). We give the register allocator a hand
    // by precomputing the address in a new vreg using LEA.
34433 
    // If this is not i686 or there is no base pointer, there is nothing to do.
34435     if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
34436       return BB;
34437 
    // Even though this code does not necessarily need the base pointer to be
    // ESI, we check for that. The reason: if this assert fails, the compiler's
    // base pointer handling has changed, and that change most probably needs
    // to be addressed here as well.
34442     assert(TRI->getBaseRegister() == X86::ESI &&
34443            "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
34444            "base pointer in mind");
34445 
34446     MachineRegisterInfo &MRI = MF->getRegInfo();
34447     MVT SPTy = getPointerTy(MF->getDataLayout());
34448     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
34449     Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
34450 
34451     X86AddressMode AM = getAddressFromInstr(&MI, 0);
    // Regalloc does not need any help when the memory operand of CMPXCHG8B
    // does not use an index register.
34454     if (AM.IndexReg == X86::NoRegister)
34455       return BB;
34456 
34457     // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
34458     // four operand definitions that are E[ABCD] registers. We skip them and
34459     // then insert the LEA.
34460     MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
34461     while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
34462                                    RMBBI->definesRegister(X86::EBX) ||
34463                                    RMBBI->definesRegister(X86::ECX) ||
34464                                    RMBBI->definesRegister(X86::EDX))) {
34465       ++RMBBI;
34466     }
34467     MachineBasicBlock::iterator MBBI(RMBBI);
34468     addFullAddress(
34469         BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
34470 
34471     setDirectAddressInInstr(&MI, 0, computedAddrVReg);
34472 
34473     return BB;
34474   }
34475   case X86::LCMPXCHG16B_NO_RBX: {
34476     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34477     Register BasePtr = TRI->getBaseRegister();
34478     if (TRI->hasBasePointer(*MF) &&
34479         (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
34480       if (!BB->isLiveIn(BasePtr))
34481         BB->addLiveIn(BasePtr);
34482       // Save RBX into a virtual register.
34483       Register SaveRBX =
34484           MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34485       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
34486           .addReg(X86::RBX);
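      // Emit LCMPXCHG16B_SAVE_RBX, forwarding the address operands, the
      // desired RBX value, and the saved RBX copy so the base pointer can be
      // restored afterwards.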
34487       Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34488       MachineInstrBuilder MIB =
34489           BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
34490       for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
34491         MIB.add(MI.getOperand(Idx));
34492       MIB.add(MI.getOperand(X86::AddrNumOperands));
34493       MIB.addReg(SaveRBX);
34494     } else {
34495       // Simple case, just copy the virtual register to RBX.
34496       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
34497           .add(MI.getOperand(X86::AddrNumOperands));
34498       MachineInstrBuilder MIB =
34499           BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
34500       for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
34501         MIB.add(MI.getOperand(Idx));
34502     }
34503     MI.eraseFromParent();
34504     return BB;
34505   }
34506   case X86::MWAITX: {
34507     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34508     Register BasePtr = TRI->getBaseRegister();
34509     bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
    // If there is no need to save the base pointer, we generate MWAITXrrr;
    // otherwise we generate the MWAITX_SAVE_RBX pseudo.
34512     if (!IsRBX || !TRI->hasBasePointer(*MF)) {
34513       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
34514           .addReg(MI.getOperand(0).getReg());
34515       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
34516           .addReg(MI.getOperand(1).getReg());
34517       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
34518           .addReg(MI.getOperand(2).getReg());
34519       BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
34520       MI.eraseFromParent();
34521     } else {
34522       if (!BB->isLiveIn(BasePtr)) {
34523         BB->addLiveIn(BasePtr);
34524       }
      // The first two parameters can be copied into ECX and EAX, but EBX
      // cannot be written yet because it still holds the base pointer.
34526       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
34527           .addReg(MI.getOperand(0).getReg());
34528       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
34529           .addReg(MI.getOperand(1).getReg());
34530       assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
34531       // Save RBX into a virtual register.
34532       Register SaveRBX =
34533           MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34534       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
34535           .addReg(X86::RBX);
34536       // Generate mwaitx pseudo.
34537       Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34538       BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
34539           .addDef(Dst) // Destination tied in with SaveRBX.
34540           .addReg(MI.getOperand(2).getReg()) // input value of EBX.
34541           .addUse(SaveRBX);                  // Save of base pointer.
34542       MI.eraseFromParent();
34543     }
34544     return BB;
34545   }
34546   case TargetOpcode::PREALLOCATED_SETUP: {
34547     assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
34548     auto MFI = MF->getInfo<X86MachineFunctionInfo>();
34549     MFI->setHasPreallocatedCall(true);
34550     int64_t PreallocatedId = MI.getOperand(0).getImm();
34551     size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
34552     assert(StackAdjustment != 0 && "0 stack adjustment");
34553     LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
34554                       << StackAdjustment << "\n");
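    // Carve out the preallocated argument area by subtracting the adjustment
    // from ESP directly.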
34555     BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
34556         .addReg(X86::ESP)
34557         .addImm(StackAdjustment);
34558     MI.eraseFromParent();
34559     return BB;
34560   }
34561   case TargetOpcode::PREALLOCATED_ARG: {
34562     assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
34563     int64_t PreallocatedId = MI.getOperand(1).getImm();
34564     int64_t ArgIdx = MI.getOperand(2).getImm();
34565     auto MFI = MF->getInfo<X86MachineFunctionInfo>();
34566     size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
34567     LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
34568                       << ", arg offset " << ArgOffset << "\n");
    // Materialize stack pointer + offset into the result register with LEA.
34570     addRegOffset(
34571         BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
34572         X86::ESP, false, ArgOffset);
34573     MI.eraseFromParent();
34574     return BB;
34575   }
34576   case X86::PTDPBSSD:
34577   case X86::PTDPBSUD:
34578   case X86::PTDPBUSD:
34579   case X86::PTDPBUUD:
34580   case X86::PTDPBF16PS: {
34581     unsigned Opc;
34582     switch (MI.getOpcode()) {
34583     case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
34584     case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
34585     case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
34586     case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
34587     case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
34588     }
34589 
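    // Rewrite the pseudo's immediate tile numbers as physical TMM register
    // operands: the destination tile is both defined and used (it acts as the
    // accumulator), followed by the two source tiles.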
34590     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
34591     MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
34592     MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
34593     MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
34594     MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
34595 
34596     MI.eraseFromParent(); // The pseudo is gone now.
34597     return BB;
34598   }
34599   case X86::PTILEZERO: {
34600     unsigned Imm = MI.getOperand(0).getImm();
34601     BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
34602     MI.eraseFromParent(); // The pseudo is gone now.
34603     return BB;
34604   }
34605   case X86::PTILELOADD:
34606   case X86::PTILELOADDT1:
34607   case X86::PTILESTORED: {
34608     unsigned Opc;
34609     switch (MI.getOpcode()) {
34610     case X86::PTILELOADD:   Opc = X86::TILELOADD;   break;
34611     case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
34612     case X86::PTILESTORED:  Opc = X86::TILESTORED;  break;
34613     }
34614 
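    // TILELOADD/TILELOADDT1 define the destination tile and then take the
    // usual five memory operands; TILESTORED takes the memory operands first
    // and the source tile last.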
34615     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
34616     unsigned CurOp = 0;
34617     if (Opc != X86::TILESTORED)
34618       MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
34619                  RegState::Define);
34620 
34621     MIB.add(MI.getOperand(CurOp++)); // base
34622     MIB.add(MI.getOperand(CurOp++)); // scale
34623     MIB.add(MI.getOperand(CurOp++)); // index -- stride
34624     MIB.add(MI.getOperand(CurOp++)); // displacement
34625     MIB.add(MI.getOperand(CurOp++)); // segment
34626 
34627     if (Opc == X86::TILESTORED)
34628       MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
34629                  RegState::Undef);
34630 
34631     MI.eraseFromParent(); // The pseudo is gone now.
34632     return BB;
34633   }
34634   }
34635 }
34636 
34637 //===----------------------------------------------------------------------===//
34638 //                           X86 Optimization Hooks
34639 //===----------------------------------------------------------------------===//
34640 
34641 bool
34642 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
34643                                                 const APInt &DemandedBits,
34644                                                 const APInt &DemandedElts,
34645                                                 TargetLoweringOpt &TLO) const {
34646   EVT VT = Op.getValueType();
34647   unsigned Opcode = Op.getOpcode();
34648   unsigned EltSize = VT.getScalarSizeInBits();
34649 
34650   if (VT.isVector()) {
    // If the constant is all sign bits within the active bits, then we should
    // sign extend it across the entire constant to allow it to act as a
    // boolean constant vector.
34654     auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
34655       if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
34656         return false;
34657       for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
34658         if (!DemandedElts[i] || V.getOperand(i).isUndef())
34659           continue;
34660         const APInt &Val = V.getConstantOperandAPInt(i);
34661         if (Val.getBitWidth() > Val.getNumSignBits() &&
34662             Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
34663           return true;
34664       }
34665       return false;
34666     };
34667     // For vectors - if we have a constant, then try to sign extend.
34668     // TODO: Handle AND/ANDN cases.
34669     unsigned ActiveBits = DemandedBits.getActiveBits();
34670     if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
34671         (Opcode == ISD::OR || Opcode == ISD::XOR) &&
34672         NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
34673       EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
34674       EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
34675                                     VT.getVectorNumElements());
34676       SDValue NewC =
34677           TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
34678                           Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
34679       SDValue NewOp =
34680           TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
34681       return TLO.CombineTo(Op, NewOp);
34682     }
34683     return false;
34684   }
34685 
34686   // Only optimize Ands to prevent shrinking a constant that could be
34687   // matched by movzx.
34688   if (Opcode != ISD::AND)
34689     return false;
34690 
34691   // Make sure the RHS really is a constant.
34692   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
34693   if (!C)
34694     return false;
34695 
34696   const APInt &Mask = C->getAPIntValue();
34697 
34698   // Clear all non-demanded bits initially.
34699   APInt ShrunkMask = Mask & DemandedBits;
34700 
34701   // Find the width of the shrunk mask.
34702   unsigned Width = ShrunkMask.getActiveBits();
34703 
34704   // If the mask is all 0s there's nothing to do here.
34705   if (Width == 0)
34706     return false;
34707 
34708   // Find the next power of 2 width, rounding up to a byte.
34709   Width = PowerOf2Ceil(std::max(Width, 8U));
  // Clamp the width to the element size to handle illegal types.
34711   Width = std::min(Width, EltSize);
34712 
34713   // Calculate a possible zero extend mask for this constant.
34714   APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
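  // For example, with Mask = 0xFF0 and DemandedBits = 0x0F0, ShrunkMask is
  // 0x0F0 (8 active bits), giving ZeroExtendMask = 0xFF, a mask that movzx
  // can match.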
34715 
34716   // If we aren't changing the mask, just return true to keep it and prevent
34717   // the caller from optimizing.
34718   if (ZeroExtendMask == Mask)
34719     return true;
34720 
34721   // Make sure the new mask can be represented by a combination of mask bits
34722   // and non-demanded bits.
34723   if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
34724     return false;
34725 
34726   // Replace the constant with the zero extend mask.
34727   SDLoc DL(Op);
34728   SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
34729   SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
34730   return TLO.CombineTo(Op, NewOp);
34731 }
34732 
34733 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
34734                                                       KnownBits &Known,
34735                                                       const APInt &DemandedElts,
34736                                                       const SelectionDAG &DAG,
34737                                                       unsigned Depth) const {
34738   unsigned BitWidth = Known.getBitWidth();
34739   unsigned NumElts = DemandedElts.getBitWidth();
34740   unsigned Opc = Op.getOpcode();
34741   EVT VT = Op.getValueType();
34742   assert((Opc >= ISD::BUILTIN_OP_END ||
34743           Opc == ISD::INTRINSIC_WO_CHAIN ||
34744           Opc == ISD::INTRINSIC_W_CHAIN ||
34745           Opc == ISD::INTRINSIC_VOID) &&
34746          "Should use MaskedValueIsZero if you don't know whether Op"
34747          " is a target node!");
34748 
34749   Known.resetAll();
34750   switch (Opc) {
34751   default: break;
34752   case X86ISD::SETCC:
34753     Known.Zero.setBitsFrom(1);
34754     break;
34755   case X86ISD::MOVMSK: {
34756     unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
34757     Known.Zero.setBitsFrom(NumLoBits);
34758     break;
34759   }
34760   case X86ISD::PEXTRB:
34761   case X86ISD::PEXTRW: {
34762     SDValue Src = Op.getOperand(0);
34763     EVT SrcVT = Src.getValueType();
34764     APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
34765                                             Op.getConstantOperandVal(1));
34766     Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
34767     Known = Known.anyextOrTrunc(BitWidth);
34768     Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
34769     break;
34770   }
34771   case X86ISD::VSRAI:
34772   case X86ISD::VSHLI:
34773   case X86ISD::VSRLI: {
34774     unsigned ShAmt = Op.getConstantOperandVal(1);
34775     if (ShAmt >= VT.getScalarSizeInBits()) {
34776       Known.setAllZero();
34777       break;
34778     }
34779 
34780     Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34781     if (Opc == X86ISD::VSHLI) {
34782       Known.Zero <<= ShAmt;
34783       Known.One <<= ShAmt;
34784       // Low bits are known zero.
34785       Known.Zero.setLowBits(ShAmt);
34786     } else if (Opc == X86ISD::VSRLI) {
34787       Known.Zero.lshrInPlace(ShAmt);
34788       Known.One.lshrInPlace(ShAmt);
34789       // High bits are known zero.
34790       Known.Zero.setHighBits(ShAmt);
34791     } else {
34792       Known.Zero.ashrInPlace(ShAmt);
34793       Known.One.ashrInPlace(ShAmt);
34794     }
34795     break;
34796   }
34797   case X86ISD::PACKUS: {
34798     // PACKUS is just a truncation if the upper half is zero.
34799     APInt DemandedLHS, DemandedRHS;
34800     getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
34801 
34802     Known.One = APInt::getAllOnesValue(BitWidth * 2);
34803     Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
34804 
34805     KnownBits Known2;
34806     if (!!DemandedLHS) {
34807       Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
34808       Known = KnownBits::commonBits(Known, Known2);
34809     }
34810     if (!!DemandedRHS) {
34811       Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
34812       Known = KnownBits::commonBits(Known, Known2);
34813     }
34814 
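    // Known is currently 2 * BitWidth wide; PACKUS is only a truncation if the
    // upper half of every demanded source element is known zero, otherwise
    // give up before truncating.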
34815     if (Known.countMinLeadingZeros() < BitWidth)
34816       Known.resetAll();
34817     Known = Known.trunc(BitWidth);
34818     break;
34819   }
34820   case X86ISD::VBROADCAST: {
34821     SDValue Src = Op.getOperand(0);
34822     if (!Src.getSimpleValueType().isVector()) {
34823       Known = DAG.computeKnownBits(Src, Depth + 1);
34824       return;
34825     }
34826     break;
34827   }
34828   case X86ISD::ANDNP: {
34829     KnownBits Known2;
34830     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34831     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34832 
34833     // ANDNP = (~X & Y);
34834     Known.One &= Known2.Zero;
34835     Known.Zero |= Known2.One;
34836     break;
34837   }
34838   case X86ISD::FOR: {
34839     KnownBits Known2;
34840     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34841     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34842 
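    // FOR is a bitwise OR of floating-point values, so known bits combine just
    // as they do for ISD::OR.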
34843     Known |= Known2;
34844     break;
34845   }
34846   case X86ISD::PSADBW: {
34847     assert(VT.getScalarType() == MVT::i64 &&
34848            Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
34849            "Unexpected PSADBW types");
34850 
34851     // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
34852     Known.Zero.setBitsFrom(16);
34853     break;
34854   }
34855   case X86ISD::PMULUDQ: {
34856     KnownBits Known2;
34857     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34858     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34859 
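    // PMULUDQ multiplies only the low halves of each 64-bit element and
    // implicitly zero-extends them, so model the operands accordingly.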
34860     Known = Known.trunc(BitWidth / 2).zext(BitWidth);
34861     Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
34862     Known = KnownBits::mul(Known, Known2);
34863     break;
34864   }
34865   case X86ISD::CMOV: {
34866     Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
34867     // If we don't know any bits, early out.
34868     if (Known.isUnknown())
34869       break;
34870     KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
34871 
34872     // Only known if known in both the LHS and RHS.
34873     Known = KnownBits::commonBits(Known, Known2);
34874     break;
34875   }
34876   case X86ISD::BEXTR:
34877   case X86ISD::BEXTRI: {
34878     SDValue Op0 = Op.getOperand(0);
34879     SDValue Op1 = Op.getOperand(1);
34880 
34881     if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
34882       unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
34883       unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
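      // The control operand encodes the start bit in bits [7:0] and the
      // extraction length in bits [15:8].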
34884 
34885       // If the length is 0, the result is 0.
34886       if (Length == 0) {
34887         Known.setAllZero();
34888         break;
34889       }
34890 
34891       if ((Shift + Length) <= BitWidth) {
34892         Known = DAG.computeKnownBits(Op0, Depth + 1);
34893         Known = Known.extractBits(Length, Shift);
34894         Known = Known.zextOrTrunc(BitWidth);
34895       }
34896     }
34897     break;
34898   }
34899   case X86ISD::PDEP: {
34900     KnownBits Known2;
34901     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34902     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
    // Zeros are retained from the mask operand, but ones are not.
34904     Known.One.clearAllBits();
34905     // The result will have at least as many trailing zeros as the non-mask
34906     // operand since bits can only map to the same or higher bit position.
34907     Known.Zero.setLowBits(Known2.countMinTrailingZeros());
34908     break;
34909   }
34910   case X86ISD::PEXT: {
34911     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
    // The result has at least as many leading zeros as there are known zero
    // bits in the mask.
34913     unsigned Count = Known.Zero.countPopulation();
34914     Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
34915     Known.One.clearAllBits();
34916     break;
34917   }
34918   case X86ISD::VTRUNC:
34919   case X86ISD::VTRUNCS:
34920   case X86ISD::VTRUNCUS:
34921   case X86ISD::CVTSI2P:
34922   case X86ISD::CVTUI2P:
34923   case X86ISD::CVTP2SI:
34924   case X86ISD::CVTP2UI:
34925   case X86ISD::MCVTP2SI:
34926   case X86ISD::MCVTP2UI:
34927   case X86ISD::CVTTP2SI:
34928   case X86ISD::CVTTP2UI:
34929   case X86ISD::MCVTTP2SI:
34930   case X86ISD::MCVTTP2UI:
34931   case X86ISD::MCVTSI2P:
34932   case X86ISD::MCVTUI2P:
34933   case X86ISD::VFPROUND:
34934   case X86ISD::VMFPROUND:
34935   case X86ISD::CVTPS2PH:
34936   case X86ISD::MCVTPS2PH: {
34937     // Truncations/Conversions - upper elements are known zero.
34938     EVT SrcVT = Op.getOperand(0).getValueType();
34939     if (SrcVT.isVector()) {
34940       unsigned NumSrcElts = SrcVT.getVectorNumElements();
34941       if (NumElts > NumSrcElts &&
34942           DemandedElts.countTrailingZeros() >= NumSrcElts)
34943         Known.setAllZero();
34944     }
34945     break;
34946   }
34947   case X86ISD::STRICT_CVTTP2SI:
34948   case X86ISD::STRICT_CVTTP2UI:
34949   case X86ISD::STRICT_CVTSI2P:
34950   case X86ISD::STRICT_CVTUI2P:
34951   case X86ISD::STRICT_VFPROUND:
34952   case X86ISD::STRICT_CVTPS2PH: {
34953     // Strict Conversions - upper elements are known zero.
34954     EVT SrcVT = Op.getOperand(1).getValueType();
34955     if (SrcVT.isVector()) {
34956       unsigned NumSrcElts = SrcVT.getVectorNumElements();
34957       if (NumElts > NumSrcElts &&
34958           DemandedElts.countTrailingZeros() >= NumSrcElts)
34959         Known.setAllZero();
34960     }
34961     break;
34962   }
34963   case X86ISD::MOVQ2DQ: {
34964     // Move from MMX to XMM. Upper half of XMM should be 0.
34965     if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
34966       Known.setAllZero();
34967     break;
34968   }
34969   }
34970 
34971   // Handle target shuffles.
34972   // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
34973   if (isTargetShuffle(Opc)) {
34974     SmallVector<int, 64> Mask;
34975     SmallVector<SDValue, 2> Ops;
34976     if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
34977       unsigned NumOps = Ops.size();
34978       unsigned NumElts = VT.getVectorNumElements();
34979       if (Mask.size() == NumElts) {
34980         SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
34981         Known.Zero.setAllBits(); Known.One.setAllBits();
34982         for (unsigned i = 0; i != NumElts; ++i) {
34983           if (!DemandedElts[i])
34984             continue;
34985           int M = Mask[i];
34986           if (M == SM_SentinelUndef) {
34987             // For UNDEF elements, we don't know anything about the common state
34988             // of the shuffle result.
34989             Known.resetAll();
34990             break;
34991           }
34992           if (M == SM_SentinelZero) {
34993             Known.One.clearAllBits();
34994             continue;
34995           }
34996           assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
34997                  "Shuffle index out of range");
34998 
34999           unsigned OpIdx = (unsigned)M / NumElts;
35000           unsigned EltIdx = (unsigned)M % NumElts;
35001           if (Ops[OpIdx].getValueType() != VT) {
35002             // TODO - handle target shuffle ops with different value types.
35003             Known.resetAll();
35004             break;
35005           }
35006           DemandedOps[OpIdx].setBit(EltIdx);
35007         }
35008         // Known bits are the values that are shared by every demanded element.
35009         for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
35010           if (!DemandedOps[i])
35011             continue;
35012           KnownBits Known2 =
35013               DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
35014           Known = KnownBits::commonBits(Known, Known2);
35015         }
35016       }
35017     }
35018   }
35019 }
35020 
35021 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
35022     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
35023     unsigned Depth) const {
35024   EVT VT = Op.getValueType();
35025   unsigned VTBits = VT.getScalarSizeInBits();
35026   unsigned Opcode = Op.getOpcode();
35027   switch (Opcode) {
35028   case X86ISD::SETCC_CARRY:
35029     // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
35030     return VTBits;
35031 
35032   case X86ISD::VTRUNC: {
35033     SDValue Src = Op.getOperand(0);
35034     MVT SrcVT = Src.getSimpleValueType();
35035     unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
35036     assert(VTBits < NumSrcBits && "Illegal truncation input type");
35037     APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
35038     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
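    // Sign bits in excess of the high bits dropped by the truncation carry
    // over to the result.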
35039     if (Tmp > (NumSrcBits - VTBits))
35040       return Tmp - (NumSrcBits - VTBits);
35041     return 1;
35042   }
35043 
35044   case X86ISD::PACKSS: {
35045     // PACKSS is just a truncation if the sign bits extend to the packed size.
35046     APInt DemandedLHS, DemandedRHS;
35047     getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
35048                         DemandedRHS);
35049 
35050     unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
35051     unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
35052     if (!!DemandedLHS)
35053       Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
35054     if (!!DemandedRHS)
35055       Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
35056     unsigned Tmp = std::min(Tmp0, Tmp1);
35057     if (Tmp > (SrcBits - VTBits))
35058       return Tmp - (SrcBits - VTBits);
35059     return 1;
35060   }
35061 
35062   case X86ISD::VBROADCAST: {
35063     SDValue Src = Op.getOperand(0);
35064     if (!Src.getSimpleValueType().isVector())
35065       return DAG.ComputeNumSignBits(Src, Depth + 1);
35066     break;
35067   }
35068 
35069   case X86ISD::VSHLI: {
35070     SDValue Src = Op.getOperand(0);
35071     const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
35072     if (ShiftVal.uge(VTBits))
35073       return VTBits; // Shifted all bits out --> zero.
35074     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
35075     if (ShiftVal.uge(Tmp))
35076       return 1; // Shifted all sign bits out --> unknown.
35077     return Tmp - ShiftVal.getZExtValue();
35078   }
35079 
35080   case X86ISD::VSRAI: {
35081     SDValue Src = Op.getOperand(0);
35082     APInt ShiftVal = Op.getConstantOperandAPInt(1);
35083     if (ShiftVal.uge(VTBits - 1))
35084       return VTBits; // Sign splat.
35085     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
35086     ShiftVal += Tmp;
35087     return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
35088   }
35089 
35090   case X86ISD::FSETCC:
35091     // cmpss/cmpsd return zero/all-bits result values in the bottom element.
35092     if (VT == MVT::f32 || VT == MVT::f64 ||
35093         ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
35094       return VTBits;
35095     break;
35096 
35097   case X86ISD::PCMPGT:
35098   case X86ISD::PCMPEQ:
35099   case X86ISD::CMPP:
35100   case X86ISD::VPCOM:
35101   case X86ISD::VPCOMU:
35102     // Vector compares return zero/all-bits result values.
35103     return VTBits;
35104 
35105   case X86ISD::ANDNP: {
35106     unsigned Tmp0 =
35107         DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
35108     if (Tmp0 == 1) return 1; // Early out.
35109     unsigned Tmp1 =
35110         DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
35111     return std::min(Tmp0, Tmp1);
35112   }
35113 
35114   case X86ISD::CMOV: {
35115     unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
35116     if (Tmp0 == 1) return 1;  // Early out.
35117     unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
35118     return std::min(Tmp0, Tmp1);
35119   }
35120   }
35121 
35122   // Handle target shuffles.
35123   // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
35124   if (isTargetShuffle(Opcode)) {
35125     SmallVector<int, 64> Mask;
35126     SmallVector<SDValue, 2> Ops;
35127     if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
35128       unsigned NumOps = Ops.size();
35129       unsigned NumElts = VT.getVectorNumElements();
35130       if (Mask.size() == NumElts) {
35131         SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
35132         for (unsigned i = 0; i != NumElts; ++i) {
35133           if (!DemandedElts[i])
35134             continue;
35135           int M = Mask[i];
35136           if (M == SM_SentinelUndef) {
35137             // For UNDEF elements, we don't know anything about the common state
35138             // of the shuffle result.
35139             return 1;
35140           } else if (M == SM_SentinelZero) {
35141             // Zero = all sign bits.
35142             continue;
35143           }
35144           assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
35145                  "Shuffle index out of range");
35146 
35147           unsigned OpIdx = (unsigned)M / NumElts;
35148           unsigned EltIdx = (unsigned)M % NumElts;
35149           if (Ops[OpIdx].getValueType() != VT) {
35150             // TODO - handle target shuffle ops with different value types.
35151             return 1;
35152           }
35153           DemandedOps[OpIdx].setBit(EltIdx);
35154         }
35155         unsigned Tmp0 = VTBits;
35156         for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
35157           if (!DemandedOps[i])
35158             continue;
35159           unsigned Tmp1 =
35160               DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
35161           Tmp0 = std::min(Tmp0, Tmp1);
35162         }
35163         return Tmp0;
35164       }
35165     }
35166   }
35167 
35168   // Fallback case.
35169   return 1;
35170 }
35171 
35172 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
35173   if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
35174     return N->getOperand(0);
35175   return N;
35176 }
35177 
35178 // Helper to look for a normal load that can be narrowed into a vzload with the
35179 // specified VT and memory VT. Returns SDValue() on failure.
35180 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
35181                                   SelectionDAG &DAG) {
35182   // Can't if the load is volatile or atomic.
35183   if (!LN->isSimple())
35184     return SDValue();
35185 
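  // Build a VZEXT_LOAD that reads MemVT from the same address and chain and
  // zero-fills the remaining elements of VT.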
35186   SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35187   SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
35188   return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
35189                                  LN->getPointerInfo(), LN->getOriginalAlign(),
35190                                  LN->getMemOperand()->getFlags());
35191 }
35192 
35193 // Attempt to match a combined shuffle mask against supported unary shuffle
35194 // instructions.
35195 // TODO: Investigate sharing more of this with shuffle lowering.
35196 static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
35197                               bool AllowFloatDomain, bool AllowIntDomain,
35198                               SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
35199                               const X86Subtarget &Subtarget, unsigned &Shuffle,
35200                               MVT &SrcVT, MVT &DstVT) {
35201   unsigned NumMaskElts = Mask.size();
35202   unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
35203 
35204   // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
35205   if (MaskEltSize == 32 && Mask[0] == 0) {
35206     if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
35207       Shuffle = X86ISD::VZEXT_MOVL;
35208       SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35209       return true;
35210     }
35211     if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35212         isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
35213       Shuffle = X86ISD::VZEXT_MOVL;
35214       SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35215       return true;
35216     }
35217   }
35218 
  // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
35220   // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
35221   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
35222                          (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
35223     unsigned MaxScale = 64 / MaskEltSize;
35224     for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
35225       bool MatchAny = true;
35226       bool MatchZero = true;
35227       unsigned NumDstElts = NumMaskElts / Scale;
35228       for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
35229         if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
35230           MatchAny = MatchZero = false;
35231           break;
35232         }
35233         MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
35234         MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
35235       }
35236       if (MatchAny || MatchZero) {
35237         assert(MatchZero && "Failed to match zext but matched aext?");
35238         unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
35239         MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
35240                                             MVT::getIntegerVT(MaskEltSize);
35241         SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
35242 
35243         if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
35244           V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
35245 
35246         Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
35247         if (SrcVT.getVectorNumElements() != NumDstElts)
35248           Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
35249 
35250         DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
35251         DstVT = MVT::getVectorVT(DstVT, NumDstElts);
35252         return true;
35253       }
35254     }
35255   }
35256 
  // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit
  // elements (MOVSS).
35258   if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
35259       isUndefOrEqual(Mask[0], 0) &&
35260       isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
35261     Shuffle = X86ISD::VZEXT_MOVL;
35262     SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35263     return true;
35264   }
35265 
  // Check if we have SSE3, which will let us use MOVDDUP etc. These
  // instructions are no slower than UNPCKLPD but have the option to fold the
  // input operand into even an unaligned memory load.
35269   if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
35270     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
35271       Shuffle = X86ISD::MOVDDUP;
35272       SrcVT = DstVT = MVT::v2f64;
35273       return true;
35274     }
35275     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
35276       Shuffle = X86ISD::MOVSLDUP;
35277       SrcVT = DstVT = MVT::v4f32;
35278       return true;
35279     }
35280     if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
35281       Shuffle = X86ISD::MOVSHDUP;
35282       SrcVT = DstVT = MVT::v4f32;
35283       return true;
35284     }
35285   }
35286 
35287   if (MaskVT.is256BitVector() && AllowFloatDomain) {
35288     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
35289     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
35290       Shuffle = X86ISD::MOVDDUP;
35291       SrcVT = DstVT = MVT::v4f64;
35292       return true;
35293     }
35294     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
35295       Shuffle = X86ISD::MOVSLDUP;
35296       SrcVT = DstVT = MVT::v8f32;
35297       return true;
35298     }
35299     if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
35300       Shuffle = X86ISD::MOVSHDUP;
35301       SrcVT = DstVT = MVT::v8f32;
35302       return true;
35303     }
35304   }
35305 
35306   if (MaskVT.is512BitVector() && AllowFloatDomain) {
35307     assert(Subtarget.hasAVX512() &&
35308            "AVX512 required for 512-bit vector shuffles");
35309     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
35310       Shuffle = X86ISD::MOVDDUP;
35311       SrcVT = DstVT = MVT::v8f64;
35312       return true;
35313     }
35314     if (isTargetShuffleEquivalent(
35315             MaskVT, Mask,
35316             {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
35317       Shuffle = X86ISD::MOVSLDUP;
35318       SrcVT = DstVT = MVT::v16f32;
35319       return true;
35320     }
35321     if (isTargetShuffleEquivalent(
35322             MaskVT, Mask,
35323             {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
35324       Shuffle = X86ISD::MOVSHDUP;
35325       SrcVT = DstVT = MVT::v16f32;
35326       return true;
35327     }
35328   }
35329 
35330   return false;
35331 }
35332 
35333 // Attempt to match a combined shuffle mask against supported unary immediate
35334 // permute instructions.
35335 // TODO: Investigate sharing more of this with shuffle lowering.
35336 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
35337                                      const APInt &Zeroable,
35338                                      bool AllowFloatDomain, bool AllowIntDomain,
35339                                      const X86Subtarget &Subtarget,
35340                                      unsigned &Shuffle, MVT &ShuffleVT,
35341                                      unsigned &PermuteImm) {
35342   unsigned NumMaskElts = Mask.size();
35343   unsigned InputSizeInBits = MaskVT.getSizeInBits();
35344   unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
35345   MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
35346   bool ContainsZeros = isAnyZero(Mask);
35347 
  // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
35349   if (!ContainsZeros && MaskScalarSizeInBits == 64) {
35350     // Check for lane crossing permutes.
35351     if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
35352       // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
35353       if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
35354         Shuffle = X86ISD::VPERMI;
35355         ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
35356         PermuteImm = getV4X86ShuffleImm(Mask);
35357         return true;
35358       }
35359       if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
35360         SmallVector<int, 4> RepeatedMask;
35361         if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
35362           Shuffle = X86ISD::VPERMI;
35363           ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
35364           PermuteImm = getV4X86ShuffleImm(RepeatedMask);
35365           return true;
35366         }
35367       }
35368     } else if (AllowFloatDomain && Subtarget.hasAVX()) {
35369       // VPERMILPD can permute with a non-repeating shuffle.
35370       Shuffle = X86ISD::VPERMILPI;
35371       ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
35372       PermuteImm = 0;
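      // Each f64 element selects either the low (0) or high (1) element of its
      // 128-bit lane; collect those selection bits into the immediate.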
35373       for (int i = 0, e = Mask.size(); i != e; ++i) {
35374         int M = Mask[i];
35375         if (M == SM_SentinelUndef)
35376           continue;
35377         assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
35378         PermuteImm |= (M & 1) << i;
35379       }
35380       return true;
35381     }
35382   }
35383 
35384   // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
  // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we had
  // to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
35387   if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
35388       !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
35389     SmallVector<int, 4> RepeatedMask;
35390     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
35391       // Narrow the repeated mask to create 32-bit element permutes.
35392       SmallVector<int, 4> WordMask = RepeatedMask;
35393       if (MaskScalarSizeInBits == 64)
35394         narrowShuffleMaskElts(2, RepeatedMask, WordMask);
35395 
35396       Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
35397       ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
35398       ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
35399       PermuteImm = getV4X86ShuffleImm(WordMask);
35400       return true;
35401     }
35402   }
35403 
35404   // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
35405   if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
35406       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35407        (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35408        (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
35409     SmallVector<int, 4> RepeatedMask;
35410     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
35411       ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
35412       ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
35413 
35414       // PSHUFLW: permute lower 4 elements only.
35415       if (isUndefOrInRange(LoMask, 0, 4) &&
35416           isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
35417         Shuffle = X86ISD::PSHUFLW;
35418         ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
35419         PermuteImm = getV4X86ShuffleImm(LoMask);
35420         return true;
35421       }
35422 
35423       // PSHUFHW: permute upper 4 elements only.
35424       if (isUndefOrInRange(HiMask, 4, 8) &&
35425           isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
35426         // Offset the HiMask so that we can create the shuffle immediate.
35427         int OffsetHiMask[4];
35428         for (int i = 0; i != 4; ++i)
35429           OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
35430 
35431         Shuffle = X86ISD::PSHUFHW;
35432         ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
35433         PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
35434         return true;
35435       }
35436     }
35437   }
35438 
35439   // Attempt to match against byte/bit shifts.
35440   if (AllowIntDomain &&
35441       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35442        (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35443        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35444     int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
35445                                        Mask, 0, Zeroable, Subtarget);
35446     if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
35447                          32 <= ShuffleVT.getScalarSizeInBits())) {
35448       PermuteImm = (unsigned)ShiftAmt;
35449       return true;
35450     }
35451   }
35452 
35453   // Attempt to match against bit rotates.
35454   if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
35455       ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
35456        Subtarget.hasAVX512())) {
35457     int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
35458                                             Subtarget, Mask);
35459     if (0 < RotateAmt) {
35460       Shuffle = X86ISD::VROTLI;
35461       PermuteImm = (unsigned)RotateAmt;
35462       return true;
35463     }
35464   }
35465 
35466   return false;
35467 }
35468 
// Attempt to match a combined shuffle mask against supported binary shuffle
// instructions.
35471 // TODO: Investigate sharing more of this with shuffle lowering.
35472 static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
35473                                bool AllowFloatDomain, bool AllowIntDomain,
35474                                SDValue &V1, SDValue &V2, const SDLoc &DL,
35475                                SelectionDAG &DAG, const X86Subtarget &Subtarget,
35476                                unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
35477                                bool IsUnary) {
35478   unsigned NumMaskElts = Mask.size();
35479   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
35480 
35481   if (MaskVT.is128BitVector()) {
35482     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
35483       V2 = V1;
35484       V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
35485       Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
35486       SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
35487       return true;
35488     }
35489     if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
35490       V2 = V1;
35491       Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
35492       SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
35493       return true;
35494     }
35495     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
35496         Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
35497       std::swap(V1, V2);
35498       Shuffle = X86ISD::MOVSD;
35499       SrcVT = DstVT = MVT::v2f64;
35500       return true;
35501     }
35502     if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
35503         (AllowFloatDomain || !Subtarget.hasSSE41())) {
35504       Shuffle = X86ISD::MOVSS;
35505       SrcVT = DstVT = MVT::v4f32;
35506       return true;
35507     }
35508   }
35509 
  // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
35511   if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
35512       ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
35513       ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
35514     if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
35515                              Subtarget)) {
35516       DstVT = MaskVT;
35517       return true;
35518     }
35519   }
35520 
35521   // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
35522   if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
35523       (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35524       (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
35525       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35526       (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
35527     if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
35528                               Subtarget)) {
35529       SrcVT = DstVT = MaskVT;
35530       if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
35531         SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
35532       return true;
35533     }
35534   }
35535 
  // Attempt to match against an OR if we're performing a blend shuffle and the
35537   // non-blended source element is zero in each case.
35538   if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
35539       (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
35540     bool IsBlend = true;
35541     unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
35542     unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
35543     unsigned Scale1 = NumV1Elts / NumMaskElts;
35544     unsigned Scale2 = NumV2Elts / NumMaskElts;
35545     APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
35546     APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
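    // Classify each mask element: zero elements demand zero from both sources,
    // while an element taken from one source demands zero in the other.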
35547     for (unsigned i = 0; i != NumMaskElts; ++i) {
35548       int M = Mask[i];
35549       if (M == SM_SentinelUndef)
35550         continue;
35551       if (M == SM_SentinelZero) {
35552         DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
35553         DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
35554         continue;
35555       }
35556       if (M == (int)i) {
35557         DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
35558         continue;
35559       }
35560       if (M == (int)(i + NumMaskElts)) {
35561         DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
35562         continue;
35563       }
35564       IsBlend = false;
35565       break;
35566     }
35567     if (IsBlend &&
35568         DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
35569         DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
35570       Shuffle = ISD::OR;
35571       SrcVT = DstVT = MaskVT.changeTypeToInteger();
35572       return true;
35573     }
35574   }
35575 
35576   return false;
35577 }
35578 
35579 static bool matchBinaryPermuteShuffle(
35580     MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
35581     bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
35582     const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
35583     unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
35584   unsigned NumMaskElts = Mask.size();
35585   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
35586 
35587   // Attempt to match against VALIGND/VALIGNQ rotate.
35588   if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
35589       ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
35590        (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
35591        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35592     if (!isAnyZero(Mask)) {
35593       int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
35594       if (0 < Rotation) {
35595         Shuffle = X86ISD::VALIGN;
35596         if (EltSizeInBits == 64)
35597           ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
35598         else
35599           ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
35600         PermuteImm = Rotation;
35601         return true;
35602       }
35603     }
35604   }
35605 
35606   // Attempt to match against PALIGNR byte rotate.
35607   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
35608                          (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35609                          (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
35610     int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
35611     if (0 < ByteRotation) {
35612       Shuffle = X86ISD::PALIGNR;
35613       ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
35614       PermuteImm = ByteRotation;
35615       return true;
35616     }
35617   }
35618 
35619   // Attempt to combine to X86ISD::BLENDI.
35620   if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
35621                             (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
35622       (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
35623     uint64_t BlendMask = 0;
35624     bool ForceV1Zero = false, ForceV2Zero = false;
35625     SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
35626     if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
35627                             ForceV2Zero, BlendMask)) {
35628       if (MaskVT == MVT::v16i16) {
35629         // We can only use v16i16 PBLENDW if the lanes are repeated.
35630         SmallVector<int, 8> RepeatedMask;
35631         if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
35632                                         RepeatedMask)) {
35633           assert(RepeatedMask.size() == 8 &&
35634                  "Repeated mask size doesn't match!");
35635           PermuteImm = 0;
35636           for (int i = 0; i < 8; ++i)
35637             if (RepeatedMask[i] >= 8)
35638               PermuteImm |= 1 << i;
35639           V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35640           V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35641           Shuffle = X86ISD::BLENDI;
35642           ShuffleVT = MaskVT;
35643           return true;
35644         }
35645       } else {
35646         V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35647         V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35648         PermuteImm = (unsigned)BlendMask;
35649         Shuffle = X86ISD::BLENDI;
35650         ShuffleVT = MaskVT;
35651         return true;
35652       }
35653     }
35654   }
35655 
35656   // Attempt to combine to INSERTPS, but only if it has elements that need to
35657   // be set to zero.
35658   if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
35659       MaskVT.is128BitVector() && isAnyZero(Mask) &&
35660       matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
35661     Shuffle = X86ISD::INSERTPS;
35662     ShuffleVT = MVT::v4f32;
35663     return true;
35664   }
35665 
35666   // Attempt to combine to SHUFPD.
35667   if (AllowFloatDomain && EltSizeInBits == 64 &&
35668       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35669        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
35670        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35671     bool ForceV1Zero = false, ForceV2Zero = false;
35672     if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
35673                                PermuteImm, Mask, Zeroable)) {
35674       V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35675       V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35676       Shuffle = X86ISD::SHUFP;
35677       ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
35678       return true;
35679     }
35680   }
35681 
35682   // Attempt to combine to SHUFPS.
35683   if (AllowFloatDomain && EltSizeInBits == 32 &&
35684       ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
35685        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
35686        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35687     SmallVector<int, 4> RepeatedMask;
35688     if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
35689       // Match each half of the repeated mask to determine if it's just
35690       // referencing one of the vectors, is zeroable or is entirely undef.
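      // Illustrative example (mask values assumed): for a repeated mask of
      // {1,3,4,6}, the low half {1,3} matches V1 and the high half {4,6}
      // matches V2, giving ShufMask = {1,3,0,2} and a SHUFPS immediate of
      // getV4X86ShuffleImm({1,3,0,2}) = 0x8D.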
35691       auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
35692         int M0 = RepeatedMask[Offset];
35693         int M1 = RepeatedMask[Offset + 1];
35694 
35695         if (isUndefInRange(RepeatedMask, Offset, 2)) {
35696           return DAG.getUNDEF(MaskVT);
35697         } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
35698           S0 = (SM_SentinelUndef == M0 ? -1 : 0);
35699           S1 = (SM_SentinelUndef == M1 ? -1 : 1);
35700           return getZeroVector(MaskVT, Subtarget, DAG, DL);
35701         } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
35702           S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
35703           S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
35704           return V1;
35705         } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
35706           S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
35707           S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
35708           return V2;
35709         }
35710 
35711         return SDValue();
35712       };
35713 
35714       int ShufMask[4] = {-1, -1, -1, -1};
35715       SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
35716       SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
35717 
35718       if (Lo && Hi) {
35719         V1 = Lo;
35720         V2 = Hi;
35721         Shuffle = X86ISD::SHUFP;
35722         ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
35723         PermuteImm = getV4X86ShuffleImm(ShufMask);
35724         return true;
35725       }
35726     }
35727   }
35728 
35729   // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
35730   if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
35731       MaskVT.is128BitVector() &&
35732       matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
35733     Shuffle = X86ISD::INSERTPS;
35734     ShuffleVT = MVT::v4f32;
35735     return true;
35736   }
35737 
35738   return false;
35739 }
35740 
35741 static SDValue combineX86ShuffleChainWithExtract(
35742     ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
35743     bool HasVariableMask, bool AllowVariableCrossLaneMask,
35744     bool AllowVariablePerLaneMask, SelectionDAG &DAG,
35745     const X86Subtarget &Subtarget);
35746 
35747 /// Combine an arbitrary chain of shuffles into a single instruction if
35748 /// possible.
35749 ///
35750 /// This is the leaf of the recursive combine below. When we have found some
35751 /// chain of single-use x86 shuffle instructions and accumulated the combined
35752 /// shuffle mask represented by them, this will try to pattern match that mask
35753 /// into either a single instruction if there is a special purpose instruction
35754 /// for this operation, or into a PSHUFB instruction which is a fully general
35755 /// instruction but should only be used to replace chains over a certain depth.
35756 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
35757                                       ArrayRef<int> BaseMask, int Depth,
35758                                       bool HasVariableMask,
35759                                       bool AllowVariableCrossLaneMask,
35760                                       bool AllowVariablePerLaneMask,
35761                                       SelectionDAG &DAG,
35762                                       const X86Subtarget &Subtarget) {
35763   assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
35764   assert((Inputs.size() == 1 || Inputs.size() == 2) &&
35765          "Unexpected number of shuffle inputs!");
35766 
35767   MVT RootVT = Root.getSimpleValueType();
35768   unsigned RootSizeInBits = RootVT.getSizeInBits();
35769   unsigned NumRootElts = RootVT.getVectorNumElements();
35770 
35771   // Canonicalize shuffle input op to the requested type.
35772   // TODO: Support cases where Op is smaller than VT.
35773   auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
35774     return DAG.getBitcast(VT, Op);
35775   };
35776 
35777   // Find the inputs that enter the chain. Note that multiple uses are OK
35778   // here; we're not going to remove the operands we find.
35779   bool UnaryShuffle = (Inputs.size() == 1);
35780   SDValue V1 = peekThroughBitcasts(Inputs[0]);
35781   SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
35782                              : peekThroughBitcasts(Inputs[1]));
35783 
35784   MVT VT1 = V1.getSimpleValueType();
35785   MVT VT2 = V2.getSimpleValueType();
35786   assert(VT1.getSizeInBits() == RootSizeInBits &&
35787          VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
35788 
35789   SDLoc DL(Root);
35790   SDValue Res;
35791 
35792   unsigned NumBaseMaskElts = BaseMask.size();
35793   if (NumBaseMaskElts == 1) {
35794     assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
35795     return CanonicalizeShuffleInput(RootVT, V1);
35796   }
35797 
35798   bool OptForSize = DAG.shouldOptForSize();
35799   unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
35800   bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
35801                      (RootVT.isFloatingPoint() && Depth >= 1) ||
35802                      (RootVT.is256BitVector() && !Subtarget.hasAVX2());
35803 
35804   // Don't combine if we are an AVX512/EVEX target and the mask element size
35805   // is different from the root element size - this would prevent writemasks
35806   // from being reused.
35807   bool IsMaskedShuffle = false;
35808   if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
35809     if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
35810         Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
35811       IsMaskedShuffle = true;
35812     }
35813   }
35814 
35815   // If we are shuffling a broadcast (and not introducing zeros) then
35816   // we can just use the broadcast directly. This works for smaller broadcast
35817   // elements as well, as they already repeat across each mask element.
35818   if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
35819       (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
35820       V1.getValueSizeInBits() >= RootSizeInBits) {
35821     return CanonicalizeShuffleInput(RootVT, V1);
35822   }
35823 
35824   // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
35825   // etc. can be simplified.
35826   if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
35827     SmallVector<int> ScaledMask, IdentityMask;
35828     unsigned NumElts = VT1.getVectorNumElements();
35829     if (BaseMask.size() <= NumElts &&
35830         scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
35831       for (unsigned i = 0; i != NumElts; ++i)
35832         IdentityMask.push_back(i);
35833       if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
35834         return CanonicalizeShuffleInput(RootVT, V1);
35835     }
35836   }
35837 
35838   // Handle 128/256-bit lane shuffles of 512-bit vectors.
35839   if (RootVT.is512BitVector() &&
35840       (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
35841     // If the upper subvectors are zeroable, then an extract+insert is better
35842     // than using X86ISD::SHUF128. The insertion is free, even if it has to
35843     // zero the upper subvectors.
35844     if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
35845       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35846         return SDValue(); // Nothing to do!
35847       assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
35848              "Unexpected lane shuffle");
35849       Res = CanonicalizeShuffleInput(RootVT, V1);
35850       unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
35851       bool UseZero = isAnyZero(BaseMask);
35852       Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
35853       return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
35854     }
35855 
35856     // Narrow shuffle mask to v4x128.
35857     SmallVector<int, 4> Mask;
35858     assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
35859     narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);
35860 
35861     // Try to lower to vshuf64x2/vshuf32x4.
35862     auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
35863                             SDValue V1, SDValue V2, SelectionDAG &DAG) {
35864       unsigned PermMask = 0;
35865       // Ensure elements came from the same Op.
35866       SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
35867       for (int i = 0; i < 4; ++i) {
35868         assert(Mask[i] >= -1 && "Illegal shuffle sentinel value");
35869         if (Mask[i] < 0)
35870           continue;
35871 
35872         SDValue Op = Mask[i] >= 4 ? V2 : V1;
35873         unsigned OpIndex = i / 2;
35874         if (Ops[OpIndex].isUndef())
35875           Ops[OpIndex] = Op;
35876         else if (Ops[OpIndex] != Op)
35877           return SDValue();
35878 
35879         // Convert the 128-bit shuffle mask selection values into 128-bit
35880         // selection bits defined by a vshuf64x2 instruction's immediate control
35881         // byte.
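        // Illustrative example (mask values assumed): for Mask = {1,3,4,6},
        // result lanes 0/1 come from V1 and lanes 2/3 from V2, so PermMask
        // becomes (1 << 0) | (3 << 2) | (0 << 4) | (2 << 6) = 0x8D.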
35882         PermMask |= (Mask[i] % 4) << (i * 2);
35883       }
35884 
35885       return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
35886                          CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
35887                          CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
35888                          DAG.getTargetConstant(PermMask, DL, MVT::i8));
35889     };
35890 
35891     // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
35892     // doesn't work because our mask is for 128 bits and we don't have an MVT
35893     // to match that.
35894     bool PreferPERMQ =
35895         UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) &&
35896         isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) &&
35897         isUndefOrInRange(Mask[3], 2, 4) &&
35898         (Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) &&
35899         (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
35900 
35901     if (!isAnyZero(Mask) && !PreferPERMQ) {
35902       if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
35903         return SDValue(); // Nothing to do!
35904       MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
35905       if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
35906         return DAG.getBitcast(RootVT, V);
35907     }
35908   }
35909 
35910   // Handle 128-bit lane shuffles of 256-bit vectors.
35911   if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
35912     // If the upper half is zeroable, then an extract+insert is better than
35913     // using X86ISD::VPERM2X128. The insertion is free, even if it has to
35914     // zero the upper half.
35915     if (isUndefOrZero(BaseMask[1])) {
35916       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35917         return SDValue(); // Nothing to do!
35918       assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
35919       Res = CanonicalizeShuffleInput(RootVT, V1);
35920       Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
35921       return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
35922                             DL, 256);
35923     }
35924 
35925     // If we're splatting the low subvector, an insert-subvector 'concat'
35926     // pattern is quicker than VPERM2X128.
35927     // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
35928     if (BaseMask[0] == 0 && BaseMask[1] == 0 && !Subtarget.hasAVX2()) {
35929       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35930         return SDValue(); // Nothing to do!
35931       Res = CanonicalizeShuffleInput(RootVT, V1);
35932       Res = extractSubVector(Res, 0, DAG, DL, 128);
35933       return concatSubVectors(Res, Res, DAG, DL);
35934     }
35935 
35936     if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
35937       return SDValue(); // Nothing to do!
35938 
35939     // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
35940     // we need to use the zeroing feature.
35941     // Prefer blends for sequential shuffles unless we are optimizing for size.
35942     if (UnaryShuffle &&
35943         !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
35944         (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
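      // The immediate built below selects a 128-bit source lane per result
      // half: bits[1:0] for the low half, bits[5:4] for the high half, with
      // 0x8 in a nibble zeroing that half. For example (values assumed), a
      // base mask of {SM_SentinelZero, 1} would encode as 0x18.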
35945       unsigned PermMask = 0;
35946       PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
35947       PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
35948       return DAG.getNode(
35949           X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
35950           DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
35951     }
35952 
35953     if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
35954       return SDValue(); // Nothing to do!
35955 
35956     // TODO - handle AVX512VL cases with X86ISD::SHUF128.
35957     if (!UnaryShuffle && !IsMaskedShuffle) {
35958       assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
35959              "Unexpected shuffle sentinel value");
35960       // Prefer blends to X86ISD::VPERM2X128.
35961       if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
35962             (BaseMask[0] == 2 && BaseMask[1] == 1))) {
35963         unsigned PermMask = 0;
35964         PermMask |= ((BaseMask[0] & 3) << 0);
35965         PermMask |= ((BaseMask[1] & 3) << 4);
35966         SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
35967         SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
35968         return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
35969                           CanonicalizeShuffleInput(RootVT, LHS),
35970                           CanonicalizeShuffleInput(RootVT, RHS),
35971                           DAG.getTargetConstant(PermMask, DL, MVT::i8));
35972       }
35973     }
35974   }
35975 
35976   // For masks that have been widened to 128-bit elements or more,
35977   // narrow back down to 64-bit elements.
35978   SmallVector<int, 64> Mask;
35979   if (BaseMaskEltSizeInBits > 64) {
35980     assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
35981     int MaskScale = BaseMaskEltSizeInBits / 64;
35982     narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
35983   } else {
35984     Mask.assign(BaseMask.begin(), BaseMask.end());
35985   }
35986 
35987   // For masked shuffles, we're trying to match the root width for better
35988   // writemask folding, so attempt to scale the mask.
35989   // TODO - variable shuffles might need this to be widened again.
35990   if (IsMaskedShuffle && NumRootElts > Mask.size()) {
35991     assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
35992     int MaskScale = NumRootElts / Mask.size();
35993     SmallVector<int, 64> ScaledMask;
35994     narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
35995     Mask = std::move(ScaledMask);
35996   }
35997 
35998   unsigned NumMaskElts = Mask.size();
35999   unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
36000 
36001   // Determine the effective mask value type.
36002   FloatDomain &= (32 <= MaskEltSizeInBits);
36003   MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
36004                            : MVT::getIntegerVT(MaskEltSizeInBits);
36005   MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
36006 
36007   // Only allow legal mask types.
36008   if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
36009     return SDValue();
36010 
36011   // Attempt to match the mask against known shuffle patterns.
36012   MVT ShuffleSrcVT, ShuffleVT;
36013   unsigned Shuffle, PermuteImm;
36014 
36015   // Which shuffle domains are permitted?
36016   // Permit domain crossing at higher combine depths.
36017   // TODO: Should we indicate which domain is preferred if both are allowed?
36018   bool AllowFloatDomain = FloatDomain || (Depth >= 3);
36019   bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
36020                         (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
36021 
36022   // Determine zeroable mask elements.
36023   APInt KnownUndef, KnownZero;
36024   resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
36025   APInt Zeroable = KnownUndef | KnownZero;
36026 
36027   if (UnaryShuffle) {
36028     // Attempt to match against broadcast-from-vector.
36029     // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
36030     if ((Subtarget.hasAVX2() ||
36031          (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
36032         (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
36033       if (isUndefOrEqual(Mask, 0)) {
36034         if (V1.getValueType() == MaskVT &&
36035             V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36036             MayFoldLoad(V1.getOperand(0))) {
36037           if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
36038             return SDValue(); // Nothing to do!
36039           Res = V1.getOperand(0);
36040           Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
36041           return DAG.getBitcast(RootVT, Res);
36042         }
36043         if (Subtarget.hasAVX2()) {
36044           if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
36045             return SDValue(); // Nothing to do!
36046           Res = CanonicalizeShuffleInput(MaskVT, V1);
36047           Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
36048           return DAG.getBitcast(RootVT, Res);
36049         }
36050       }
36051     }
36052 
36053     SDValue NewV1 = V1; // Save operand in case early exit happens.
36054     if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
36055                           DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
36056                           ShuffleVT) &&
36057         (!IsMaskedShuffle ||
36058          (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36059       if (Depth == 0 && Root.getOpcode() == Shuffle)
36060         return SDValue(); // Nothing to do!
36061       Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
36062       Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
36063       return DAG.getBitcast(RootVT, Res);
36064     }
36065 
36066     if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
36067                                  AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
36068                                  PermuteImm) &&
36069         (!IsMaskedShuffle ||
36070          (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36071       if (Depth == 0 && Root.getOpcode() == Shuffle)
36072         return SDValue(); // Nothing to do!
36073       Res = CanonicalizeShuffleInput(ShuffleVT, V1);
36074       Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
36075                         DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36076       return DAG.getBitcast(RootVT, Res);
36077     }
36078   }
36079 
36080   // Attempt to combine to INSERTPS, but only if the inserted element has come
36081   // from a scalar.
36082   // TODO: Handle other insertions here as well?
36083   if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
36084       Subtarget.hasSSE41() &&
36085       !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
36086     if (MaskEltSizeInBits == 32) {
36087       SDValue SrcV1 = V1, SrcV2 = V2;
36088       if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
36089                                  DAG) &&
36090           SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
36091         if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
36092           return SDValue(); // Nothing to do!
36093         Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
36094                           CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
36095                           CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
36096                           DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36097         return DAG.getBitcast(RootVT, Res);
36098       }
36099     }
36100     if (MaskEltSizeInBits == 64 &&
36101         isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
36102         V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36103         V2.getScalarValueSizeInBits() <= 32) {
36104       if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
36105         return SDValue(); // Nothing to do!
36106       PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
36107       Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
36108                         CanonicalizeShuffleInput(MVT::v4f32, V1),
36109                         CanonicalizeShuffleInput(MVT::v4f32, V2),
36110                         DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36111       return DAG.getBitcast(RootVT, Res);
36112     }
36113   }
36114 
36115   SDValue NewV1 = V1; // Save operands in case early exit happens.
36116   SDValue NewV2 = V2;
36117   if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
36118                          NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
36119                          ShuffleVT, UnaryShuffle) &&
36120       (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36121     if (Depth == 0 && Root.getOpcode() == Shuffle)
36122       return SDValue(); // Nothing to do!
36123     NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
36124     NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
36125     Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
36126     return DAG.getBitcast(RootVT, Res);
36127   }
36128 
36129   NewV1 = V1; // Save operands in case early exit happens.
36130   NewV2 = V2;
36131   if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
36132                                 AllowIntDomain, NewV1, NewV2, DL, DAG,
36133                                 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
36134       (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36135     if (Depth == 0 && Root.getOpcode() == Shuffle)
36136       return SDValue(); // Nothing to do!
36137     NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
36138     NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
36139     Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
36140                       DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36141     return DAG.getBitcast(RootVT, Res);
36142   }
36143 
36144   // Typically from here on, we need an integer version of MaskVT.
36145   MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
36146   IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
36147 
36148   // Annoyingly, SSE4A instructions don't map into the above match helpers.
36149   if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
36150     uint64_t BitLen, BitIdx;
36151     if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
36152                             Zeroable)) {
36153       if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
36154         return SDValue(); // Nothing to do!
36155       V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
36156       Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
36157                         DAG.getTargetConstant(BitLen, DL, MVT::i8),
36158                         DAG.getTargetConstant(BitIdx, DL, MVT::i8));
36159       return DAG.getBitcast(RootVT, Res);
36160     }
36161 
36162     if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
36163       if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
36164         return SDValue(); // Nothing to do!
36165       V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
36166       V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
36167       Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
36168                         DAG.getTargetConstant(BitLen, DL, MVT::i8),
36169                         DAG.getTargetConstant(BitIdx, DL, MVT::i8));
36170       return DAG.getBitcast(RootVT, Res);
36171     }
36172   }
36173 
36174   // Match shuffle against TRUNCATE patterns.
36175   if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
36176     // Match against a VTRUNC instruction, accounting for src/dst sizes.
36177     if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
36178                              Subtarget)) {
36179       bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
36180                         ShuffleSrcVT.getVectorNumElements();
36181       unsigned Opc =
36182           IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
36183       if (Depth == 0 && Root.getOpcode() == Opc)
36184         return SDValue(); // Nothing to do!
36185       V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
36186       Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
36187       if (ShuffleVT.getSizeInBits() < RootSizeInBits)
36188         Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
36189       return DAG.getBitcast(RootVT, Res);
36190     }
36191 
36192     // Do we need a more general binary truncation pattern?
36193     if (RootSizeInBits < 512 &&
36194         ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
36195          (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
36196         (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
36197         isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
36198       if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
36199         return SDValue(); // Nothing to do!
36200       ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
36201       ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
36202       V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
36203       V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
36204       ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
36205       ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
36206       Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
36207       Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
36208       return DAG.getBitcast(RootVT, Res);
36209     }
36210   }
36211 
36212   // Don't try to re-form single instruction chains under any circumstances now
36213   // that we've done encoding canonicalization for them.
36214   if (Depth < 1)
36215     return SDValue();
36216 
36217   // Depth threshold above which we can efficiently use variable mask shuffles.
36218   int VariableCrossLaneShuffleDepth =
36219       Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
36220   int VariablePerLaneShuffleDepth =
36221       Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
36222   AllowVariableCrossLaneMask &=
36223       (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
36224   AllowVariablePerLaneMask &=
36225       (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
36226   // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake, so we require a
36227   // higher depth before combining them.
36228   bool AllowBWIVPERMV3 =
36229       (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
36230 
36231   bool MaskContainsZeros = isAnyZero(Mask);
36232 
36233   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
36234     // If we have a single input lane-crossing shuffle then lower to VPERMV.
36235     if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
36236       if (Subtarget.hasAVX2() &&
36237           (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
36238         SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
36239         Res = CanonicalizeShuffleInput(MaskVT, V1);
36240         Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
36241         return DAG.getBitcast(RootVT, Res);
36242       }
36243       // AVX512 variants (non-VLX will pad to 512-bit shuffles).
36244       if ((Subtarget.hasAVX512() &&
36245            (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36246             MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
36247           (Subtarget.hasBWI() &&
36248            (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36249           (Subtarget.hasVBMI() &&
36250            (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
36251         V1 = CanonicalizeShuffleInput(MaskVT, V1);
36252         V2 = DAG.getUNDEF(MaskVT);
36253         Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36254         return DAG.getBitcast(RootVT, Res);
36255       }
36256     }
36257 
36258     // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
36259     // vector as the second source (non-VLX will pad to 512-bit shuffles).
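    // Illustrative example (mask values assumed): for a v8i32 mask of
    // {4,Z,6,Z,0,Z,2,Z}, the loop below rewrites each zero sentinel to the
    // matching element of the zero vector, giving {4,9,6,11,0,13,2,15} for
    // VPERMV3 with V2 = zero.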
36260     if (UnaryShuffle && AllowVariableCrossLaneMask &&
36261         ((Subtarget.hasAVX512() &&
36262           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36263            MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
36264            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
36265            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
36266          (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36267           (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36268          (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36269           (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36270       // Adjust shuffle mask - replace SM_SentinelZero with second source index.
36271       for (unsigned i = 0; i != NumMaskElts; ++i)
36272         if (Mask[i] == SM_SentinelZero)
36273           Mask[i] = NumMaskElts + i;
36274       V1 = CanonicalizeShuffleInput(MaskVT, V1);
36275       V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
36276       Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36277       return DAG.getBitcast(RootVT, Res);
36278     }
36279 
36280     // If that failed and either input is extracted then try to combine as a
36281     // shuffle with the larger type.
36282     if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
36283             Inputs, Root, BaseMask, Depth, HasVariableMask,
36284             AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
36285             Subtarget))
36286       return WideShuffle;
36287 
36288     // If we have a dual input lane-crossing shuffle then lower to VPERMV3
36289     // (non-VLX will pad to 512-bit shuffles).
36290     if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
36291         ((Subtarget.hasAVX512() &&
36292           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36293            MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
36294            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
36295            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
36296          (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36297           (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36298          (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36299           (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36300       V1 = CanonicalizeShuffleInput(MaskVT, V1);
36301       V2 = CanonicalizeShuffleInput(MaskVT, V2);
36302       Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36303       return DAG.getBitcast(RootVT, Res);
36304     }
36305     return SDValue();
36306   }
36307 
36308   // See if we can combine a single input shuffle with zeros to a bit-mask,
36309   // which is much simpler than any shuffle.
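  // Illustrative example (mask values assumed): a v4i32 mask of {0,Z,2,Z}
  // keeps elements 0 and 2 in place and zeroes the rest, so it becomes an
  // AND with the constant vector {-1, 0, -1, 0}.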
36310   if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
36311       isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
36312       DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
36313     APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
36314     APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
36315     APInt UndefElts(NumMaskElts, 0);
36316     SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
36317     for (unsigned i = 0; i != NumMaskElts; ++i) {
36318       int M = Mask[i];
36319       if (M == SM_SentinelUndef) {
36320         UndefElts.setBit(i);
36321         continue;
36322       }
36323       if (M == SM_SentinelZero)
36324         continue;
36325       EltBits[i] = AllOnes;
36326     }
36327     SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
36328     Res = CanonicalizeShuffleInput(MaskVT, V1);
36329     unsigned AndOpcode =
36330         MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
36331     Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
36332     return DAG.getBitcast(RootVT, Res);
36333   }
36334 
36335   // If we have a single input shuffle with different shuffle patterns in the
36336   // 128-bit lanes, use a variable mask with VPERMILPS.
36337   // TODO: Combine other mask types at higher depths.
36338   if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
36339       ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
36340        (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
36341     SmallVector<SDValue, 16> VPermIdx;
36342     for (int M : Mask) {
36343       SDValue Idx =
36344           M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
36345       VPermIdx.push_back(Idx);
36346     }
36347     SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
36348     Res = CanonicalizeShuffleInput(MaskVT, V1);
36349     Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
36350     return DAG.getBitcast(RootVT, Res);
36351   }
36352 
36353   // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
36354   // to VPERMIL2PD/VPERMIL2PS.
36355   if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
36356       (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
36357        MaskVT == MVT::v8f32)) {
36358     // VPERMIL2 Operation.
36359     // Bits[3] - Match Bit.
36360     // Bits[2:1] - (Per Lane) PD Shuffle Mask.
36361     // Bits[2:0] - (Per Lane) PS Shuffle Mask.
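    // Illustrative example (mask values assumed): for a v4f32 mask of
    // {0, 5, Z, 3}, the loop below builds VPerm2Idx = {0, 5, 8, 3} (indices
    // 0-3 select V1, 4-7 select V2, 8 requests zero) and sets M2ZImm = 2 so
    // the zeroing selector takes effect.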
36362     unsigned NumLanes = MaskVT.getSizeInBits() / 128;
36363     unsigned NumEltsPerLane = NumMaskElts / NumLanes;
36364     SmallVector<int, 8> VPerm2Idx;
36365     unsigned M2ZImm = 0;
36366     for (int M : Mask) {
36367       if (M == SM_SentinelUndef) {
36368         VPerm2Idx.push_back(-1);
36369         continue;
36370       }
36371       if (M == SM_SentinelZero) {
36372         M2ZImm = 2;
36373         VPerm2Idx.push_back(8);
36374         continue;
36375       }
36376       int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
36377       Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
36378       VPerm2Idx.push_back(Index);
36379     }
36380     V1 = CanonicalizeShuffleInput(MaskVT, V1);
36381     V2 = CanonicalizeShuffleInput(MaskVT, V2);
36382     SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
36383     Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
36384                       DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
36385     return DAG.getBitcast(RootVT, Res);
36386   }
36387 
36388   // If we have 3 or more shuffle instructions or a chain involving a variable
36389   // mask, we can replace them with a single PSHUFB instruction profitably.
36390   // Intel's manuals suggest only using PSHUFB if doing so replaces 5
36391   // instructions, but in practice PSHUFB tends to be *very* fast so we're
36392   // more aggressive.
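  // For reference: each PSHUFB control byte indexes a byte within its own
  // 128-bit lane, and a control byte with the top bit set (0x80) zeroes the
  // result byte, which is how SM_SentinelZero is handled below.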
36393   if (UnaryShuffle && AllowVariablePerLaneMask &&
36394       ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
36395        (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
36396        (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
36397     SmallVector<SDValue, 16> PSHUFBMask;
36398     int NumBytes = RootVT.getSizeInBits() / 8;
36399     int Ratio = NumBytes / NumMaskElts;
36400     for (int i = 0; i < NumBytes; ++i) {
36401       int M = Mask[i / Ratio];
36402       if (M == SM_SentinelUndef) {
36403         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
36404         continue;
36405       }
36406       if (M == SM_SentinelZero) {
36407         PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
36408         continue;
36409       }
36410       M = Ratio * M + i % Ratio;
36411       assert((M / 16) == (i / 16) && "Lane crossing detected");
36412       PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
36413     }
36414     MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
36415     Res = CanonicalizeShuffleInput(ByteVT, V1);
36416     SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
36417     Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
36418     return DAG.getBitcast(RootVT, Res);
36419   }
36420 
36421   // With XOP, if we have a 128-bit binary input shuffle we can always combine
36422   // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
36423   // slower than PSHUFB on targets that support both.
36424   if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
36425       Subtarget.hasXOP()) {
36426     // VPPERM Mask Operation
36427     // Bits[4:0] - Byte Index (0 - 31)
36428     // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
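    // For example (values assumed), a control byte of 0x05 copies byte 5 of
    // the concatenated 32-byte input, while 0x80 (operation 4 in bits[7:5])
    // zeroes the result byte, matching the SM_SentinelZero handling below.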
36429     SmallVector<SDValue, 16> VPPERMMask;
36430     int NumBytes = 16;
36431     int Ratio = NumBytes / NumMaskElts;
36432     for (int i = 0; i < NumBytes; ++i) {
36433       int M = Mask[i / Ratio];
36434       if (M == SM_SentinelUndef) {
36435         VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
36436         continue;
36437       }
36438       if (M == SM_SentinelZero) {
36439         VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
36440         continue;
36441       }
36442       M = Ratio * M + i % Ratio;
36443       VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
36444     }
36445     MVT ByteVT = MVT::v16i8;
36446     V1 = CanonicalizeShuffleInput(ByteVT, V1);
36447     V2 = CanonicalizeShuffleInput(ByteVT, V2);
36448     SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
36449     Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
36450     return DAG.getBitcast(RootVT, Res);
36451   }
36452 
36453   // If that failed and either input is extracted then try to combine as a
36454   // shuffle with the larger type.
36455   if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
36456           Inputs, Root, BaseMask, Depth, HasVariableMask,
36457           AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
36458     return WideShuffle;
36459 
36460   // If we have a dual input shuffle then lower to VPERMV3
36461   // (non-VLX will pad to 512-bit shuffles).
36462   if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
36463       ((Subtarget.hasAVX512() &&
36464         (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
36465          MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
36466          MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
36467          MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
36468          MaskVT == MVT::v16i32)) ||
36469        (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36470         (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
36471          MaskVT == MVT::v32i16)) ||
36472        (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36473         (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
36474          MaskVT == MVT::v64i8)))) {
36475     V1 = CanonicalizeShuffleInput(MaskVT, V1);
36476     V2 = CanonicalizeShuffleInput(MaskVT, V2);
36477     Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36478     return DAG.getBitcast(RootVT, Res);
36479   }
36480 
36481   // Failed to find any combines.
36482   return SDValue();
36483 }
36484 
36485 // Combine an arbitrary chain of shuffles + extract_subvectors into a single
36486 // instruction if possible.
36487 //
36488 // Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
36489 // type size to attempt to combine:
36490 // shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
36491 // -->
36492 // extract_subvector(shuffle(x,y,m2),0)
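// Illustrative example (types and mask assumed): if both inputs are v4f32
// subvectors extracted at element 4 of v8f32 vectors x and y, a base mask of
// {0,4,1,5} is rewritten against the wide inputs as {4,12,5,13,-1,-1,-1,-1},
// the v8f32 shuffle is combined as usual, and the low 128 bits of the result
// are extracted.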
36493 static SDValue combineX86ShuffleChainWithExtract(
36494     ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
36495     bool HasVariableMask, bool AllowVariableCrossLaneMask,
36496     bool AllowVariablePerLaneMask, SelectionDAG &DAG,
36497     const X86Subtarget &Subtarget) {
36498   unsigned NumMaskElts = BaseMask.size();
36499   unsigned NumInputs = Inputs.size();
36500   if (NumInputs == 0)
36501     return SDValue();
36502 
36503   EVT RootVT = Root.getValueType();
36504   unsigned RootSizeInBits = RootVT.getSizeInBits();
36505   assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
36506 
36507   SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
36508   SmallVector<unsigned, 4> Offsets(NumInputs, 0);
36509 
36510   // Peek through subvectors.
36511   // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
36512   unsigned WideSizeInBits = RootSizeInBits;
36513   for (unsigned i = 0; i != NumInputs; ++i) {
36514     SDValue &Src = WideInputs[i];
36515     unsigned &Offset = Offsets[i];
36516     Src = peekThroughBitcasts(Src);
36517     EVT BaseVT = Src.getValueType();
36518     while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
36519       Offset += Src.getConstantOperandVal(1);
36520       Src = Src.getOperand(0);
36521     }
36522     WideSizeInBits = std::max(WideSizeInBits,
36523                               (unsigned)Src.getValueSizeInBits());
36524     assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
36525            "Unexpected subvector extraction");
36526     Offset /= BaseVT.getVectorNumElements();
36527     Offset *= NumMaskElts;
36528   }
36529 
36530   // Bail if we're always extracting from the lowest subvectors;
36531   // combineX86ShuffleChain should match this for the current width.
36532   if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
36533     return SDValue();
36534 
36535   unsigned Scale = WideSizeInBits / RootSizeInBits;
36536   assert((WideSizeInBits % RootSizeInBits) == 0 &&
36537          "Unexpected subvector extraction");
36538 
36539   // If the src vector types aren't the same, see if we can extend
36540   // them to match each other.
36541   // TODO: Support different scalar types?
36542   EVT WideSVT = WideInputs[0].getValueType().getScalarType();
36543   if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
36544         return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
36545                Op.getValueType().getScalarType() != WideSVT;
36546       }))
36547     return SDValue();
36548 
36549   for (SDValue &NewInput : WideInputs) {
36550     assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
36551            "Shuffle vector size mismatch");
36552     if (WideSizeInBits > NewInput.getValueSizeInBits())
36553       NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
36554                                 SDLoc(NewInput), WideSizeInBits);
36555     assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
36556            "Unexpected subvector extraction");
36557   }
36558 
36559   // Create new mask for larger type.
36560   for (unsigned i = 1; i != NumInputs; ++i)
36561     Offsets[i] += i * Scale * NumMaskElts;
36562 
36563   SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
36564   for (int &M : WideMask) {
36565     if (M < 0)
36566       continue;
36567     M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
36568   }
36569   WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
36570 
36571   // Remove unused/repeated shuffle source ops.
36572   resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
36573   assert(!WideInputs.empty() && "Shuffle with no inputs detected");
36574 
36575   if (WideInputs.size() > 2)
36576     return SDValue();
36577 
36578   // Increase depth for every upper subvector we've peeked through.
36579   Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
36580 
36581   // Attempt to combine wider chain.
36582   // TODO: Can we use a better Root?
36583   SDValue WideRoot = WideInputs[0];
36584   if (SDValue WideShuffle =
36585           combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
36586                                  HasVariableMask, AllowVariableCrossLaneMask,
36587                                  AllowVariablePerLaneMask, DAG, Subtarget)) {
36588     WideShuffle =
36589         extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
36590     return DAG.getBitcast(RootVT, WideShuffle);
36591   }
36592   return SDValue();
36593 }
36594 
36595 // Canonicalize the combined shuffle mask chain with horizontal ops.
36596 // NOTE: This may update the Ops and Mask.
36597 static SDValue canonicalizeShuffleMaskWithHorizOp(
36598     MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
36599     unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
36600     const X86Subtarget &Subtarget) {
36601   if (Mask.empty() || Ops.empty())
36602     return SDValue();
36603 
36604   SmallVector<SDValue> BC;
36605   for (SDValue Op : Ops)
36606     BC.push_back(peekThroughBitcasts(Op));
36607 
36608   // All ops must be the same horizop + type.
36609   SDValue BC0 = BC[0];
36610   EVT VT0 = BC0.getValueType();
36611   unsigned Opcode0 = BC0.getOpcode();
36612   if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
36613         return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
36614       }))
36615     return SDValue();
36616 
36617   bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
36618                   Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
36619   bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
36620   if (!isHoriz && !isPack)
36621     return SDValue();
36622 
36623   // Do all ops have a single use?
36624   bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
36625     return Op.hasOneUse() &&
36626            peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
36627   });
36628 
36629   int NumElts = VT0.getVectorNumElements();
36630   int NumLanes = VT0.getSizeInBits() / 128;
36631   int NumEltsPerLane = NumElts / NumLanes;
36632   int NumHalfEltsPerLane = NumEltsPerLane / 2;
36633   MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
36634   unsigned EltSizeInBits = RootSizeInBits / Mask.size();
36635 
36636   if (NumEltsPerLane >= 4 &&
36637       (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
36638     SmallVector<int> LaneMask, ScaledMask;
36639     if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
36640         scaleShuffleElements(LaneMask, 4, ScaledMask)) {
36641       // See if we can remove the shuffle by re-sorting the HOP chain so that
36642       // the HOP args are pre-shuffled.
36643       // TODO: Generalize to any sized/depth chain.
36644       // TODO: Add support for PACKSS/PACKUS.
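      // For reference (v4f32 HADD case): hadd(x,y) = {x0+x1, x2+x3, y0+y1,
      // y2+y3}, so a shuffle of HOP results can often be absorbed by
      // re-sorting which operands feed each HOP.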
36645       if (isHoriz) {
36646         // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
36647         auto GetHOpSrc = [&](int M) {
36648           if (M == SM_SentinelUndef)
36649             return DAG.getUNDEF(VT0);
36650           if (M == SM_SentinelZero)
36651             return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
36652           SDValue Src0 = BC[M / 4];
36653           SDValue Src1 = Src0.getOperand((M % 4) >= 2);
36654           if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
36655             return Src1.getOperand(M % 2);
36656           return SDValue();
36657         };
36658         SDValue M0 = GetHOpSrc(ScaledMask[0]);
36659         SDValue M1 = GetHOpSrc(ScaledMask[1]);
36660         SDValue M2 = GetHOpSrc(ScaledMask[2]);
36661         SDValue M3 = GetHOpSrc(ScaledMask[3]);
36662         if (M0 && M1 && M2 && M3) {
36663           SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
36664           SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
36665           return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
36666         }
36667       }
36668       // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
36669       if (Ops.size() >= 2) {
36670         SDValue LHS, RHS;
36671         auto GetHOpSrc = [&](int M, int &OutM) {
36672           // TODO: Support SM_SentinelZero
36673           if (M < 0)
36674             return M == SM_SentinelUndef;
36675           SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
36676           if (!LHS || LHS == Src) {
36677             LHS = Src;
36678             OutM = (M % 2);
36679             return true;
36680           }
36681           if (!RHS || RHS == Src) {
36682             RHS = Src;
36683             OutM = (M % 2) + 2;
36684             return true;
36685           }
36686           return false;
36687         };
36688         int PostMask[4] = {-1, -1, -1, -1};
36689         if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
36690             GetHOpSrc(ScaledMask[1], PostMask[1]) &&
36691             GetHOpSrc(ScaledMask[2], PostMask[2]) &&
36692             GetHOpSrc(ScaledMask[3], PostMask[3])) {
36693           LHS = DAG.getBitcast(SrcVT, LHS);
36694           RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
36695           SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
36696           // Use SHUFPS for the permute so this will work on SSE3 targets;
36697           // shuffle combining and domain handling will simplify this later on.
36698           MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
36699           Res = DAG.getBitcast(ShuffleVT, Res);
36700           return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
36701                              getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
36702         }
36703       }
36704     }
36705   }
36706 
36707   if (2 < Ops.size())
36708     return SDValue();
36709 
36710   SDValue BC1 = BC[BC.size() - 1];
36711   if (Mask.size() == VT0.getVectorNumElements()) {
36712     // Canonicalize binary shuffles of horizontal ops that use the
36713     // same sources to a unary shuffle.
36714     // TODO: Try to perform this fold even if the shuffle remains.
36715     if (Ops.size() == 2) {
36716       auto ContainsOps = [](SDValue HOp, SDValue Op) {
36717         return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
36718       };
36719       // Commute if all BC0's ops are contained in BC1.
36720       if (ContainsOps(BC1, BC0.getOperand(0)) &&
36721           ContainsOps(BC1, BC0.getOperand(1))) {
36722         ShuffleVectorSDNode::commuteMask(Mask);
36723         std::swap(Ops[0], Ops[1]);
36724         std::swap(BC0, BC1);
36725       }
36726 
36727       // If BC1 can be represented by BC0, then convert to unary shuffle.
36728       if (ContainsOps(BC0, BC1.getOperand(0)) &&
36729           ContainsOps(BC0, BC1.getOperand(1))) {
36730         for (int &M : Mask) {
36731           if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
36732             continue;
36733           int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
36734           M -= NumElts + (SubLane * NumHalfEltsPerLane);
36735           if (BC1.getOperand(SubLane) != BC0.getOperand(0))
36736             M += NumHalfEltsPerLane;
36737         }
36738       }
36739     }
36740 
36741     // Canonicalize unary horizontal ops to only refer to lower halves.
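    // Illustrative example (v4f32 hadd assumed): hadd(x,x) produces
    // {x0+x1, x2+x3, x0+x1, x2+x3}, so an upper-half reference within a lane
    // can be remapped to the equivalent lower-half element below.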
36742     for (int i = 0; i != NumElts; ++i) {
36743       int &M = Mask[i];
36744       if (isUndefOrZero(M))
36745         continue;
36746       if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
36747           (M % NumEltsPerLane) >= NumHalfEltsPerLane)
36748         M -= NumHalfEltsPerLane;
36749       if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
36750           (M % NumEltsPerLane) >= NumHalfEltsPerLane)
36751         M -= NumHalfEltsPerLane;
36752     }
36753   }
36754 
36755   // Combine a binary shuffle of 2 similar 'Horizontal' instructions into a
36756   // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
36757   // represents the LHS/RHS inputs for the lower/upper halves.
36758   SmallVector<int, 16> TargetMask128, WideMask128;
36759   if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
36760       scaleShuffleElements(TargetMask128, 2, WideMask128)) {
36761     assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
36762     bool SingleOp = (Ops.size() == 1);
36763     if (isPack || OneUseOps ||
36764         shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
36765       SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
36766       SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
36767       Lo = Lo.getOperand(WideMask128[0] & 1);
36768       Hi = Hi.getOperand(WideMask128[1] & 1);
36769       if (SingleOp) {
36770         SDValue Undef = DAG.getUNDEF(SrcVT);
36771         SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
36772         Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
36773         Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
36774         Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
36775         Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
36776       }
36777       return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
36778     }
36779   }
36780 
36781   return SDValue();
36782 }
36783 
36784 // Attempt to constant fold all of the constant source ops.
36785 // Returns the folded constant if the entire shuffle folds to a constant.
36786 // TODO: Extend this to merge multiple constant Ops and update the mask.
36787 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
36788                                            ArrayRef<int> Mask, SDValue Root,
36789                                            bool HasVariableMask,
36790                                            SelectionDAG &DAG,
36791                                            const X86Subtarget &Subtarget) {
36792   MVT VT = Root.getSimpleValueType();
36793 
36794   unsigned SizeInBits = VT.getSizeInBits();
36795   unsigned NumMaskElts = Mask.size();
36796   unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
36797   unsigned NumOps = Ops.size();
36798 
36799   // Extract constant bits from each source op.
36800   bool OneUseConstantOp = false;
36801   SmallVector<APInt, 16> UndefEltsOps(NumOps);
36802   SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
36803   for (unsigned i = 0; i != NumOps; ++i) {
36804     SDValue SrcOp = Ops[i];
36805     OneUseConstantOp |= SrcOp.hasOneUse();
36806     if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
36807                                        RawBitsOps[i]))
36808       return SDValue();
36809   }
36810 
  // Only fold if at least one of the constants has a single use or the
  // combined shuffle has included a variable mask shuffle; this is to avoid
  // constant pool bloat.
36814   if (!OneUseConstantOp && !HasVariableMask)
36815     return SDValue();
36816 
36817   // Shuffle the constant bits according to the mask.
36818   SDLoc DL(Root);
36819   APInt UndefElts(NumMaskElts, 0);
36820   APInt ZeroElts(NumMaskElts, 0);
36821   APInt ConstantElts(NumMaskElts, 0);
36822   SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
36823                                         APInt::getNullValue(MaskSizeInBits));
36824   for (unsigned i = 0; i != NumMaskElts; ++i) {
36825     int M = Mask[i];
36826     if (M == SM_SentinelUndef) {
36827       UndefElts.setBit(i);
36828       continue;
36829     } else if (M == SM_SentinelZero) {
36830       ZeroElts.setBit(i);
36831       continue;
36832     }
    assert(0 <= M && M < (int)(NumMaskElts * NumOps) &&
           "Shuffle mask index out of range");
36834 
36835     unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
36836     unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
36837 
36838     auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
36839     if (SrcUndefElts[SrcMaskIdx]) {
36840       UndefElts.setBit(i);
36841       continue;
36842     }
36843 
36844     auto &SrcEltBits = RawBitsOps[SrcOpIdx];
36845     APInt &Bits = SrcEltBits[SrcMaskIdx];
36846     if (!Bits) {
36847       ZeroElts.setBit(i);
36848       continue;
36849     }
36850 
36851     ConstantElts.setBit(i);
36852     ConstantBitData[i] = Bits;
36853   }
36854   assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
36855 
36856   // Attempt to create a zero vector.
36857   if ((UndefElts | ZeroElts).isAllOnesValue())
36858     return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
36859 
36860   // Create the constant data.
36861   MVT MaskSVT;
36862   if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
36863     MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
36864   else
36865     MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
36866 
36867   MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
36868   if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
36869     return SDValue();
36870 
36871   SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
36872   return DAG.getBitcast(VT, CstOp);
36873 }
36874 
namespace llvm {
namespace X86 {
enum {
  MaxShuffleCombineDepth = 8
};
} // namespace X86
} // namespace llvm
36882 
36883 /// Fully generic combining of x86 shuffle instructions.
36884 ///
36885 /// This should be the last combine run over the x86 shuffle instructions. Once
36886 /// they have been fully optimized, this will recursively consider all chains
36887 /// of single-use shuffle instructions, build a generic model of the cumulative
36888 /// shuffle operation, and check for simpler instructions which implement this
36889 /// operation. We use this primarily for two purposes:
36890 ///
36891 /// 1) Collapse generic shuffles to specialized single instructions when
36892 ///    equivalent. In most cases, this is just an encoding size win, but
36893 ///    sometimes we will collapse multiple generic shuffles into a single
36894 ///    special-purpose shuffle.
36895 /// 2) Look for sequences of shuffle instructions with 3 or more total
36896 ///    instructions, and replace them with the slightly more expensive SSSE3
36897 ///    PSHUFB instruction if available. We do this as the last combining step
36898 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
36899 ///    a suitable short sequence of other instructions. The PSHUFB will either
36900 ///    use a register or have to read from memory and so is slightly (but only
36901 ///    slightly) more expensive than the other shuffle instructions.
36902 ///
36903 /// Because this is inherently a quadratic operation (for each shuffle in
36904 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
36905 /// This should never be an issue in practice as the shuffle lowering doesn't
36906 /// produce sequences of more than 8 instructions.
36907 ///
36908 /// FIXME: We will currently miss some cases where the redundant shuffling
36909 /// would simplify under the threshold for PSHUFB formation because of
36910 /// combine-ordering. To fix this, we should do the redundant instruction
36911 /// combining in this recursive walk.
36912 static SDValue combineX86ShufflesRecursively(
36913     ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
36914     ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
36915     unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
36916     bool AllowVariablePerLaneMask, SelectionDAG &DAG,
36917     const X86Subtarget &Subtarget) {
36918   assert(RootMask.size() > 0 &&
36919          (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
36920          "Illegal shuffle root mask");
36921   assert(Root.getSimpleValueType().isVector() &&
36922          "Shuffles operate on vector types!");
36923   unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
36924 
36925   // Bound the depth of our recursive combine because this is ultimately
36926   // quadratic in nature.
36927   if (Depth >= MaxDepth)
36928     return SDValue();
36929 
36930   // Directly rip through bitcasts to find the underlying operand.
36931   SDValue Op = SrcOps[SrcOpIndex];
36932   Op = peekThroughOneUseBitcasts(Op);
36933 
36934   EVT VT = Op.getValueType();
36935   if (!VT.isVector() || !VT.isSimple())
36936     return SDValue(); // Bail if we hit a non-simple non-vector.
36937 
36938   assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
         "Can only combine shuffles up to the size of the root op.");
36940 
36941   // Extract target shuffle mask and resolve sentinels and inputs.
36942   // TODO - determine Op's demanded elts from RootMask.
36943   SmallVector<int, 64> OpMask;
36944   SmallVector<SDValue, 2> OpInputs;
36945   APInt OpUndef, OpZero;
36946   APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
36947   bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
36948   if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
36949                               OpZero, DAG, Depth, false))
36950     return SDValue();
36951 
36952   // Shuffle inputs must not be larger than the shuffle result.
36953   // TODO: Relax this for single input faux shuffles (trunc/extract_subvector).
36954   if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
36955         return OpInput.getValueSizeInBits() > VT.getSizeInBits();
36956       }))
36957     return SDValue();
36958 
36959   // If the shuffle result was smaller than the root, we need to adjust the
36960   // mask indices and pad the mask with undefs.
36961   if (RootSizeInBits > VT.getSizeInBits()) {
36962     unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
36963     unsigned OpMaskSize = OpMask.size();
36964     if (OpInputs.size() > 1) {
36965       unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
36966       for (int &M : OpMask) {
36967         if (M < 0)
36968           continue;
36969         int EltIdx = M % OpMaskSize;
36970         int OpIdx = M / OpMaskSize;
36971         M = (PaddedMaskSize * OpIdx) + EltIdx;
36972       }
36973     }
36974     OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
36975     OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
36976     OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
36977   }
36978 
36979   SmallVector<int, 64> Mask;
36980   SmallVector<SDValue, 16> Ops;
36981 
36982   // We don't need to merge masks if the root is empty.
36983   bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
36984   if (EmptyRoot) {
    // Only resolve zeros if it will remove an input; otherwise we might end
    // up in an infinite loop.
36987     bool ResolveKnownZeros = true;
36988     if (!OpZero.isNullValue()) {
36989       APInt UsedInputs = APInt::getNullValue(OpInputs.size());
36990       for (int i = 0, e = OpMask.size(); i != e; ++i) {
36991         int M = OpMask[i];
36992         if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
36993           continue;
36994         UsedInputs.setBit(M / OpMask.size());
36995         if (UsedInputs.isAllOnesValue()) {
36996           ResolveKnownZeros = false;
36997           break;
36998         }
36999       }
37000     }
37001     resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
37002                                       ResolveKnownZeros);
37003 
37004     Mask = OpMask;
37005     Ops.append(OpInputs.begin(), OpInputs.end());
37006   } else {
37007     resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
37008 
37009     // Add the inputs to the Ops list, avoiding duplicates.
37010     Ops.append(SrcOps.begin(), SrcOps.end());
37011 
37012     auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
37013       // Attempt to find an existing match.
37014       SDValue InputBC = peekThroughBitcasts(Input);
37015       for (int i = 0, e = Ops.size(); i < e; ++i)
37016         if (InputBC == peekThroughBitcasts(Ops[i]))
37017           return i;
37018       // Match failed - should we replace an existing Op?
37019       if (InsertionPoint >= 0) {
37020         Ops[InsertionPoint] = Input;
37021         return InsertionPoint;
37022       }
37023       // Add to the end of the Ops list.
37024       Ops.push_back(Input);
37025       return Ops.size() - 1;
37026     };
37027 
37028     SmallVector<int, 2> OpInputIdx;
37029     for (SDValue OpInput : OpInputs)
37030       OpInputIdx.push_back(
37031           AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
37032 
37033     assert(((RootMask.size() > OpMask.size() &&
37034              RootMask.size() % OpMask.size() == 0) ||
37035             (OpMask.size() > RootMask.size() &&
37036              OpMask.size() % RootMask.size() == 0) ||
37037             OpMask.size() == RootMask.size()) &&
37038            "The smaller number of elements must divide the larger.");
37039 
37040     // This function can be performance-critical, so we rely on the power-of-2
37041     // knowledge that we have about the mask sizes to replace div/rem ops with
37042     // bit-masks and shifts.
37043     assert(isPowerOf2_32(RootMask.size()) &&
37044            "Non-power-of-2 shuffle mask sizes");
37045     assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
37046     unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
37047     unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
37048 
37049     unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
37050     unsigned RootRatio =
37051         std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
37052     unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
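    // For example, with a 4-element RootMask and an 8-element OpMask,
    // MaskWidth is 8, RootRatio is 2 (each root index expands to two wide
    // indices) and OpRatio is 1.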
37053     assert((RootRatio == 1 || OpRatio == 1) &&
37054            "Must not have a ratio for both incoming and op masks!");
37055 
37056     assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
37057     assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
37058     assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
37059     unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
37060     unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
37061 
37062     Mask.resize(MaskWidth, SM_SentinelUndef);
37063 
37064     // Merge this shuffle operation's mask into our accumulated mask. Note that
37065     // this shuffle's mask will be the first applied to the input, followed by
37066     // the root mask to get us all the way to the root value arrangement. The
37067     // reason for this order is that we are recursing up the operation chain.
37068     for (unsigned i = 0; i < MaskWidth; ++i) {
37069       unsigned RootIdx = i >> RootRatioLog2;
37070       if (RootMask[RootIdx] < 0) {
37071         // This is a zero or undef lane, we're done.
37072         Mask[i] = RootMask[RootIdx];
37073         continue;
37074       }
37075 
37076       unsigned RootMaskedIdx =
37077           RootRatio == 1
37078               ? RootMask[RootIdx]
37079               : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
37080 
37081       // Just insert the scaled root mask value if it references an input other
37082       // than the SrcOp we're currently inserting.
37083       if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
37084           (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
37085         Mask[i] = RootMaskedIdx;
37086         continue;
37087       }
37088 
37089       RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
37090       unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
37091       if (OpMask[OpIdx] < 0) {
        // The incoming lanes are zero or undef; it doesn't matter which ones we
        // use.
37094         Mask[i] = OpMask[OpIdx];
37095         continue;
37096       }
37097 
37098       // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
37099       unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
37100                                           : (OpMask[OpIdx] << OpRatioLog2) +
37101                                                 (RootMaskedIdx & (OpRatio - 1));
37102 
37103       OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
37104       int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
37105       assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
37106       OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
37107 
37108       Mask[i] = OpMaskedIdx;
37109     }
37110   }
37111 
37112   // Remove unused/repeated shuffle source ops.
37113   resolveTargetShuffleInputsAndMask(Ops, Mask);
37114 
37115   // Handle the all undef/zero/ones cases early.
37116   if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
37117     return DAG.getUNDEF(Root.getValueType());
37118   if (all_of(Mask, [](int Idx) { return Idx < 0; }))
37119     return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
37120                          SDLoc(Root));
37121   if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
37122       none_of(Mask, [](int M) { return M == SM_SentinelZero; }))
37123     return getOnesVector(Root.getValueType(), DAG, SDLoc(Root));
37124 
37125   assert(!Ops.empty() && "Shuffle with no inputs detected");
37126   HasVariableMask |= IsOpVariableMask;
37127 
37128   // Update the list of shuffle nodes that have been combined so far.
37129   SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
37130                                                 SrcNodes.end());
37131   CombinedNodes.push_back(Op.getNode());
37132 
  // See if we can recurse into each shuffle source op (if it's a target
  // shuffle). The source op should only be combined if it either has a single
  // use (i.e. the current Op) or all its users have already been combined; if
  // not then we can still combine, but should prevent the generation of
  // variable shuffles to avoid constant pool bloat.
37138   // Don't recurse if we already have more source ops than we can combine in
37139   // the remaining recursion depth.
37140   if (Ops.size() < (MaxDepth - Depth)) {
37141     for (int i = 0, e = Ops.size(); i < e; ++i) {
37142       // For empty roots, we need to resolve zeroable elements before combining
37143       // them with other shuffles.
37144       SmallVector<int, 64> ResolvedMask = Mask;
37145       if (EmptyRoot)
37146         resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
37147       bool AllowCrossLaneVar = false;
37148       bool AllowPerLaneVar = false;
37149       if (Ops[i].getNode()->hasOneUse() ||
37150           SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
37151         AllowCrossLaneVar = AllowVariableCrossLaneMask;
37152         AllowPerLaneVar = AllowVariablePerLaneMask;
37153       }
37154       if (SDValue Res = combineX86ShufflesRecursively(
37155               Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
37156               HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
37157               Subtarget))
37158         return Res;
37159     }
37160   }
37161 
37162   // Attempt to constant fold all of the constant source ops.
37163   if (SDValue Cst = combineX86ShufflesConstants(
37164           Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
37165     return Cst;
37166 
  // If constant folding failed and we only have constants, then we have
  // multiple uses by a single non-variable shuffle - just bail.
37169   if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
37170         APInt UndefElts;
37171         SmallVector<APInt> RawBits;
37172         unsigned EltSizeInBits = RootSizeInBits / Mask.size();
37173         return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
37174                                              RawBits);
37175       })) {
37176     return SDValue();
37177   }
37178 
37179   // Canonicalize the combined shuffle mask chain with horizontal ops.
37180   // NOTE: This will update the Ops and Mask.
37181   if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
37182           Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
37183     return DAG.getBitcast(Root.getValueType(), HOp);
37184 
37185   // Widen any subvector shuffle inputs we've collected.
37186   if (any_of(Ops, [RootSizeInBits](SDValue Op) {
37187         return Op.getValueSizeInBits() < RootSizeInBits;
37188       })) {
37189     for (SDValue &Op : Ops)
37190       if (Op.getValueSizeInBits() < RootSizeInBits)
37191         Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
37192                             RootSizeInBits);
37193     // Reresolve - we might have repeated subvector sources.
37194     resolveTargetShuffleInputsAndMask(Ops, Mask);
37195   }
37196 
37197   // We can only combine unary and binary shuffle mask cases.
37198   if (Ops.size() <= 2) {
37199     // Minor canonicalization of the accumulated shuffle mask to make it easier
37200     // to match below. All this does is detect masks with sequential pairs of
37201     // elements, and shrink them to the half-width mask. It does this in a loop
37202     // so it will reduce the size of the mask to the minimal width mask which
37203     // performs an equivalent shuffle.
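    // For example, the v8i16 mask <0,1,2,3,4,5,6,7> widens to the v4i32 mask
    // <0,1,2,3> and then to the v2i64 mask <0,1>.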
37204     while (Mask.size() > 1) {
37205       SmallVector<int, 64> WidenedMask;
37206       if (!canWidenShuffleElements(Mask, WidenedMask))
37207         break;
37208       Mask = std::move(WidenedMask);
37209     }
37210 
37211     // Canonicalization of binary shuffle masks to improve pattern matching by
37212     // commuting the inputs.
37213     if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
37214       ShuffleVectorSDNode::commuteMask(Mask);
37215       std::swap(Ops[0], Ops[1]);
37216     }
37217 
37218     // Finally, try to combine into a single shuffle instruction.
37219     return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
37220                                   AllowVariableCrossLaneMask,
37221                                   AllowVariablePerLaneMask, DAG, Subtarget);
37222   }
37223 
37224   // If that failed and any input is extracted then try to combine as a
37225   // shuffle with the larger type.
37226   return combineX86ShuffleChainWithExtract(
37227       Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
37228       AllowVariablePerLaneMask, DAG, Subtarget);
37229 }
37230 
37231 /// Helper entry wrapper to combineX86ShufflesRecursively.
37232 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
37233                                              const X86Subtarget &Subtarget) {
37234   return combineX86ShufflesRecursively(
37235       {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
37236       /*HasVarMask*/ false,
37237       /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
37238       Subtarget);
37239 }
37240 
37241 /// Get the PSHUF-style mask from PSHUF node.
37242 ///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
37245 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
37246   MVT VT = N.getSimpleValueType();
37247   SmallVector<int, 4> Mask;
37248   SmallVector<SDValue, 2> Ops;
  bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
  (void)HaveMask;
  assert(HaveMask && "Failed to decode target shuffle mask");
37253 
  // If we have more than 128 bits, only the low 128 bits of the shuffle mask
  // matter. Check that the upper masks are repeats and remove them.
37256   if (VT.getSizeInBits() > 128) {
37257     int LaneElts = 128 / VT.getScalarSizeInBits();
37258 #ifndef NDEBUG
37259     for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
37260       for (int j = 0; j < LaneElts; ++j)
37261         assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
37262                "Mask doesn't repeat in high 128-bit lanes!");
37263 #endif
37264     Mask.resize(LaneElts);
37265   }
37266 
37267   switch (N.getOpcode()) {
37268   case X86ISD::PSHUFD:
37269     return Mask;
37270   case X86ISD::PSHUFLW:
37271     Mask.resize(4);
37272     return Mask;
37273   case X86ISD::PSHUFHW:
37274     Mask.erase(Mask.begin(), Mask.begin() + 4);
37275     for (int &M : Mask)
37276       M -= 4;
37277     return Mask;
37278   default:
37279     llvm_unreachable("No valid shuffle instruction found!");
37280   }
37281 }
37282 
37283 /// Search for a combinable shuffle across a chain ending in pshufd.
37284 ///
37285 /// We walk up the chain and look for a combinable shuffle, skipping over
37286 /// shuffles that we could hoist this shuffle's transformation past without
37287 /// altering anything.
37288 static SDValue
37289 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
37290                              SelectionDAG &DAG) {
37291   assert(N.getOpcode() == X86ISD::PSHUFD &&
37292          "Called with something other than an x86 128-bit half shuffle!");
37293   SDLoc DL(N);
37294 
37295   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
37296   // of the shuffles in the chain so that we can form a fresh chain to replace
37297   // this one.
37298   SmallVector<SDValue, 8> Chain;
37299   SDValue V = N.getOperand(0);
37300   for (; V.hasOneUse(); V = V.getOperand(0)) {
37301     switch (V.getOpcode()) {
37302     default:
37303       return SDValue(); // Nothing combined!
37304 
37305     case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target-specific
      // instructions.
37308       continue;
37309 
37310     case X86ISD::PSHUFD:
37311       // Found another dword shuffle.
37312       break;
37313 
37314     case X86ISD::PSHUFLW:
37315       // Check that the low words (being shuffled) are the identity in the
37316       // dword shuffle, and the high words are self-contained.
37317       if (Mask[0] != 0 || Mask[1] != 1 ||
37318           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
37319         return SDValue();
37320 
37321       Chain.push_back(V);
37322       continue;
37323 
37324     case X86ISD::PSHUFHW:
37325       // Check that the high words (being shuffled) are the identity in the
37326       // dword shuffle, and the low words are self-contained.
37327       if (Mask[2] != 2 || Mask[3] != 3 ||
37328           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
37329         return SDValue();
37330 
37331       Chain.push_back(V);
37332       continue;
37333 
37334     case X86ISD::UNPCKL:
37335     case X86ISD::UNPCKH:
37336       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
37337       // shuffle into a preceding word shuffle.
37338       if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
37339           V.getSimpleValueType().getVectorElementType() != MVT::i16)
37340         return SDValue();
37341 
37342       // Search for a half-shuffle which we can combine with.
37343       unsigned CombineOp =
37344           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
37345       if (V.getOperand(0) != V.getOperand(1) ||
37346           !V->isOnlyUserOf(V.getOperand(0).getNode()))
37347         return SDValue();
37348       Chain.push_back(V);
37349       V = V.getOperand(0);
37350       do {
37351         switch (V.getOpcode()) {
37352         default:
37353           return SDValue(); // Nothing to combine.
37354 
37355         case X86ISD::PSHUFLW:
37356         case X86ISD::PSHUFHW:
37357           if (V.getOpcode() == CombineOp)
37358             break;
37359 
37360           Chain.push_back(V);
37361 
37362           LLVM_FALLTHROUGH;
37363         case ISD::BITCAST:
37364           V = V.getOperand(0);
37365           continue;
37366         }
37367         break;
37368       } while (V.hasOneUse());
37369       break;
37370     }
37371     // Break out of the loop if we break out of the switch.
37372     break;
37373   }
37374 
37375   if (!V.hasOneUse())
37376     // We fell out of the loop without finding a viable combining instruction.
37377     return SDValue();
37378 
37379   // Merge this node's mask and our incoming mask.
37380   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
37381   for (int &M : Mask)
37382     M = VMask[M];
37383   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
37384                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
37385 
37386   // Rebuild the chain around this new shuffle.
37387   while (!Chain.empty()) {
37388     SDValue W = Chain.pop_back_val();
37389 
37390     if (V.getValueType() != W.getOperand(0).getValueType())
37391       V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
37392 
37393     switch (W.getOpcode()) {
37394     default:
37395       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
37396 
37397     case X86ISD::UNPCKL:
37398     case X86ISD::UNPCKH:
37399       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
37400       break;
37401 
37402     case X86ISD::PSHUFD:
37403     case X86ISD::PSHUFLW:
37404     case X86ISD::PSHUFHW:
37405       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
37406       break;
37407     }
37408   }
37409   if (V.getValueType() != N.getValueType())
37410     V = DAG.getBitcast(N.getValueType(), V);
37411 
37412   // Return the new chain to replace N.
37413   return V;
37414 }
37415 
37416 // Attempt to commute shufps LHS loads:
37417 // permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
37418 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
37419                                       SelectionDAG &DAG) {
37420   // TODO: Add vXf64 support.
37421   if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
37422     return SDValue();
37423 
  // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable and RHS is not.
37425   auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
37426     if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
37427       return SDValue();
37428     SDValue N0 = V.getOperand(0);
37429     SDValue N1 = V.getOperand(1);
37430     unsigned Imm = V.getConstantOperandVal(2);
37431     if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
37432         MayFoldLoad(peekThroughOneUseBitcasts(N1)))
37433       return SDValue();
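    // The low nibble of the immediate selects from the first source and the
    // high nibble from the second. Swapping the nibbles along with the
    // sources keeps the same elements but exchanges the two halves of the
    // result; the callers below compensate by adjusting their own immediates.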
37434     Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
37435     return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
37436                        DAG.getTargetConstant(Imm, DL, MVT::i8));
37437   };
37438 
37439   switch (N.getOpcode()) {
37440   case X86ISD::VPERMILPI:
37441     if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
37442       unsigned Imm = N.getConstantOperandVal(1);
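      // The commuted SHUFP exchanges the two halves of each 128-bit lane, so
      // flip bit 1 of every 2-bit VPERMILPI selector (XOR 0xAA) to keep
      // selecting the same elements.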
37443       return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
37444                          DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
37445     }
37446     break;
37447   case X86ISD::SHUFP: {
37448     SDValue N0 = N.getOperand(0);
37449     SDValue N1 = N.getOperand(1);
37450     unsigned Imm = N.getConstantOperandVal(2);
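    // Each commuted SHUFP source has its halves exchanged, so flip bit 1 of
    // the selectors that read from it: the low nibble for N0, the high nibble
    // for N1, or both when N0 == N1.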
37451     if (N0 == N1) {
37452       if (SDValue NewSHUFP = commuteSHUFP(N, N0))
37453         return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
37454                            DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
37455     } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
37456       return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
37457                          DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
37458     } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
37459       return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
37460                          DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
37461     }
37462     break;
37463   }
37464   }
37465 
37466   return SDValue();
37467 }
37468 
37469 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
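// For example, pshufd(add(X, C)) with a build-vector constant C can become
// add(pshufd(X), pshufd(C)), where the shuffle of the constant can itself be
// constant folded.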
37470 static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
37471                                              const SDLoc &DL) {
37472   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37473   EVT ShuffleVT = N.getValueType();
37474 
37475   auto IsMergeableWithShuffle = [](SDValue Op) {
    // AllZeros/AllOnes constants are freely shuffled and will peek through
    // bitcasts. Other constant build vectors do not peek through bitcasts.
    // Only merge with target shuffles if the shuffle has one use so shuffle
    // combining is likely to kick in.
37480     return ISD::isBuildVectorAllOnes(Op.getNode()) ||
37481            ISD::isBuildVectorAllZeros(Op.getNode()) ||
37482            ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
37483            ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
37484            (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse());
37485   };
37486   auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
    // Ensure we only shuffle whole vector src elements, unless it's a logical
    // binop where we can more aggressively move shuffles from dst to src.
37489     return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
37490            (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
37491   };
37492 
37493   unsigned Opc = N.getOpcode();
37494   switch (Opc) {
37495   // Unary and Unary+Permute Shuffles.
37496   case X86ISD::PSHUFB: {
    // Don't merge PSHUFB if it contains zeroed elements.
37498     SmallVector<int> Mask;
37499     SmallVector<SDValue> Ops;
37500     if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
37501                               Mask))
37502       break;
37503     LLVM_FALLTHROUGH;
37504   }
37505   case X86ISD::VBROADCAST:
37506   case X86ISD::MOVDDUP:
37507   case X86ISD::PSHUFD:
37508   case X86ISD::VPERMI:
37509   case X86ISD::VPERMILPI: {
37510     if (N.getOperand(0).getValueType() == ShuffleVT &&
37511         N->isOnlyUserOf(N.getOperand(0).getNode())) {
37512       SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
37513       unsigned SrcOpcode = N0.getOpcode();
37514       if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
37515         SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
37516         SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
37517         if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) {
37518           SDValue LHS, RHS;
37519           Op00 = DAG.getBitcast(ShuffleVT, Op00);
37520           Op01 = DAG.getBitcast(ShuffleVT, Op01);
37521           if (N.getNumOperands() == 2) {
37522             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
37523             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
37524           } else {
37525             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
37526             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
37527           }
37528           EVT OpVT = N0.getValueType();
37529           return DAG.getBitcast(ShuffleVT,
37530                                 DAG.getNode(SrcOpcode, DL, OpVT,
37531                                             DAG.getBitcast(OpVT, LHS),
37532                                             DAG.getBitcast(OpVT, RHS)));
37533         }
37534       }
37535     }
37536     break;
37537   }
37538   // Binary and Binary+Permute Shuffles.
37539   case X86ISD::INSERTPS: {
    // Don't merge INSERTPS if it contains zeroed elements.
37541     unsigned InsertPSMask = N.getConstantOperandVal(2);
37542     unsigned ZeroMask = InsertPSMask & 0xF;
37543     if (ZeroMask != 0)
37544       break;
37545     LLVM_FALLTHROUGH;
37546   }
37547   case X86ISD::MOVSD:
37548   case X86ISD::MOVSS:
37549   case X86ISD::BLENDI:
37550   case X86ISD::SHUFP:
37551   case X86ISD::UNPCKH:
37552   case X86ISD::UNPCKL: {
37553     if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
37554         N->isOnlyUserOf(N.getOperand(1).getNode())) {
37555       SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
37556       SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
37557       unsigned SrcOpcode = N0.getOpcode();
37558       if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
37559           IsSafeToMoveShuffle(N0, SrcOpcode) &&
37560           IsSafeToMoveShuffle(N1, SrcOpcode)) {
37561         SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
37562         SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
37563         SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
37564         SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
37565         // Ensure the total number of shuffles doesn't increase by folding this
37566         // shuffle through to the source ops.
37567         if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
37568              (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
37569             ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
37570              (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
37571           SDValue LHS, RHS;
37572           Op00 = DAG.getBitcast(ShuffleVT, Op00);
37573           Op10 = DAG.getBitcast(ShuffleVT, Op10);
37574           Op01 = DAG.getBitcast(ShuffleVT, Op01);
37575           Op11 = DAG.getBitcast(ShuffleVT, Op11);
37576           if (N.getNumOperands() == 3) {
37577             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
37578             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
37579           } else {
37580             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
37581             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
37582           }
37583           EVT OpVT = N0.getValueType();
37584           return DAG.getBitcast(ShuffleVT,
37585                                 DAG.getNode(SrcOpcode, DL, OpVT,
37586                                             DAG.getBitcast(OpVT, LHS),
37587                                             DAG.getBitcast(OpVT, RHS)));
37588         }
37589       }
37590     }
37591     break;
37592   }
37593   }
37594   return SDValue();
37595 }
37596 
37597 /// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
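/// For example, vperm2f128(movddup(X), movddup(Y)) can be rewritten as
/// movddup(vperm2f128(X, Y)), leaving a single repeated op above the lane
/// shuffle.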
37598 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
37599                                                       SelectionDAG &DAG,
37600                                                       const SDLoc &DL) {
37601   assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
37602 
37603   MVT VT = V.getSimpleValueType();
37604   SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
37605   SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
37606   unsigned SrcOpc0 = Src0.getOpcode();
37607   unsigned SrcOpc1 = Src1.getOpcode();
37608   EVT SrcVT0 = Src0.getValueType();
37609   EVT SrcVT1 = Src1.getValueType();
37610 
37611   if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
37612     return SDValue();
37613 
37614   switch (SrcOpc0) {
37615   case X86ISD::MOVDDUP: {
37616     SDValue LHS = Src0.getOperand(0);
37617     SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
37618     SDValue Res =
37619         DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
37620     Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
37621     return DAG.getBitcast(VT, Res);
37622   }
37623   case X86ISD::VPERMILPI:
37624     // TODO: Handle v4f64 permutes with different low/high lane masks.
37625     if (SrcVT0 == MVT::v4f64) {
37626       uint64_t Mask = Src0.getConstantOperandVal(1);
37627       if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
37628         break;
37629     }
37630     LLVM_FALLTHROUGH;
37631   case X86ISD::VSHLI:
37632   case X86ISD::VSRLI:
37633   case X86ISD::VSRAI:
37634   case X86ISD::PSHUFD:
37635     if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
37636       SDValue LHS = Src0.getOperand(0);
37637       SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
37638       SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
37639                                 V.getOperand(2));
37640       Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
37641       return DAG.getBitcast(VT, Res);
37642     }
37643     break;
37644   }
37645 
37646   return SDValue();
37647 }
37648 
37649 /// Try to combine x86 target specific shuffles.
37650 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
37651                                     TargetLowering::DAGCombinerInfo &DCI,
37652                                     const X86Subtarget &Subtarget) {
37653   SDLoc DL(N);
37654   MVT VT = N.getSimpleValueType();
37655   SmallVector<int, 4> Mask;
37656   unsigned Opcode = N.getOpcode();
37657 
37658   if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
37659     return R;
37660 
37661   if (SDValue R = canonicalizeShuffleWithBinOps(N, DAG, DL))
37662     return R;
37663 
37664   // Handle specific target shuffles.
37665   switch (Opcode) {
37666   case X86ISD::MOVDDUP: {
37667     SDValue Src = N.getOperand(0);
37668     // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
37669     if (VT == MVT::v2f64 && Src.hasOneUse() &&
37670         ISD::isNormalLoad(Src.getNode())) {
37671       LoadSDNode *LN = cast<LoadSDNode>(Src);
37672       if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
37673         SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
37674         DCI.CombineTo(N.getNode(), Movddup);
37675         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37676         DCI.recursivelyDeleteUnusedNodes(LN);
37677         return N; // Return N so it doesn't get rechecked!
37678       }
37679     }
37680 
37681     return SDValue();
37682   }
37683   case X86ISD::VBROADCAST: {
37684     SDValue Src = N.getOperand(0);
37685     SDValue BC = peekThroughBitcasts(Src);
37686     EVT SrcVT = Src.getValueType();
37687     EVT BCVT = BC.getValueType();
37688 
37689     // If broadcasting from another shuffle, attempt to simplify it.
37690     // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
37691     if (isTargetShuffle(BC.getOpcode()) &&
37692         VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
37693       unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
37694       SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
37695                                         SM_SentinelUndef);
37696       for (unsigned i = 0; i != Scale; ++i)
37697         DemandedMask[i] = i;
37698       if (SDValue Res = combineX86ShufflesRecursively(
37699               {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
37700               X86::MaxShuffleCombineDepth,
37701               /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
37702               /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
37703         return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
37704                            DAG.getBitcast(SrcVT, Res));
37705     }
37706 
37707     // broadcast(bitcast(src)) -> bitcast(broadcast(src))
37708     // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
37709     if (Src.getOpcode() == ISD::BITCAST &&
37710         SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
37711         DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
37712         FixedVectorType::isValidElementType(
37713             BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
37714       EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
37715                                    VT.getVectorNumElements());
37716       return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
37717     }
37718 
37719     // Reduce broadcast source vector to lowest 128-bits.
37720     if (SrcVT.getSizeInBits() > 128)
37721       return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
37722                          extract128BitVector(Src, 0, DAG, DL));
37723 
37724     // broadcast(scalar_to_vector(x)) -> broadcast(x).
37725     if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
37726       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
37727 
37728     // Share broadcast with the longest vector and extract low subvector (free).
37729     // Ensure the same SDValue from the SDNode use is being used.
37730     for (SDNode *User : Src->uses())
37731       if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
37732           Src == User->getOperand(0) &&
37733           User->getValueSizeInBits(0).getFixedSize() >
37734               VT.getFixedSizeInBits()) {
37735         return extractSubVector(SDValue(User, 0), 0, DAG, DL,
37736                                 VT.getSizeInBits());
37737       }
37738 
37739     // vbroadcast(scalarload X) -> vbroadcast_load X
37740     // For float loads, extract other uses of the scalar from the broadcast.
37741     if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
37742         ISD::isNormalLoad(Src.getNode())) {
37743       LoadSDNode *LN = cast<LoadSDNode>(Src);
37744       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37745       SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37746       SDValue BcastLd =
37747           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37748                                   LN->getMemoryVT(), LN->getMemOperand());
37749       // If the load value is used only by N, replace it via CombineTo N.
37750       bool NoReplaceExtract = Src.hasOneUse();
37751       DCI.CombineTo(N.getNode(), BcastLd);
37752       if (NoReplaceExtract) {
37753         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37754         DCI.recursivelyDeleteUnusedNodes(LN);
37755       } else {
37756         SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
37757                                   DAG.getIntPtrConstant(0, DL));
37758         DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
37759       }
37760       return N; // Return N so it doesn't get rechecked!
37761     }
37762 
37763     // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
37764     // i16. So shrink it ourselves if we can make a broadcast_load.
37765     if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
37766         Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
37767       assert(Subtarget.hasAVX2() && "Expected AVX2");
37768       SDValue TruncIn = Src.getOperand(0);
37769 
37770       // If this is a truncate of a non extending load we can just narrow it to
37771       // use a broadcast_load.
37772       if (ISD::isNormalLoad(TruncIn.getNode())) {
37773         LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
        // Unless it's volatile or atomic.
37775         if (LN->isSimple()) {
37776           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37777           SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37778           SDValue BcastLd = DAG.getMemIntrinsicNode(
37779               X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
37780               LN->getPointerInfo(), LN->getOriginalAlign(),
37781               LN->getMemOperand()->getFlags());
37782           DCI.CombineTo(N.getNode(), BcastLd);
37783           DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37784           DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37785           return N; // Return N so it doesn't get rechecked!
37786         }
37787       }
37788 
37789       // If this is a truncate of an i16 extload, we can directly replace it.
37790       if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
37791           ISD::isEXTLoad(Src.getOperand(0).getNode())) {
37792         LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
37793         if (LN->getMemoryVT().getSizeInBits() == 16) {
37794           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37795           SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37796           SDValue BcastLd =
37797               DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37798                                       LN->getMemoryVT(), LN->getMemOperand());
37799           DCI.CombineTo(N.getNode(), BcastLd);
37800           DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37801           DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37802           return N; // Return N so it doesn't get rechecked!
37803         }
37804       }
37805 
      // If this is a truncate of a load that has been shifted right, we can
      // offset the pointer and use a narrower load.
37808       if (TruncIn.getOpcode() == ISD::SRL &&
37809           TruncIn.getOperand(0).hasOneUse() &&
37810           isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
37811           ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
37812         LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
37813         unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
37814         // Make sure the shift amount and the load size are divisible by 16.
37815         // Don't do this if the load is volatile or atomic.
37816         if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
37817             LN->isSimple()) {
37818           unsigned Offset = ShiftAmt / 8;
37819           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37820           SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
37821                                                  TypeSize::Fixed(Offset), DL);
37822           SDValue Ops[] = { LN->getChain(), Ptr };
37823           SDValue BcastLd = DAG.getMemIntrinsicNode(
37824               X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
37825               LN->getPointerInfo().getWithOffset(Offset),
37826               LN->getOriginalAlign(),
37827               LN->getMemOperand()->getFlags());
37828           DCI.CombineTo(N.getNode(), BcastLd);
37829           DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37830           DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37831           return N; // Return N so it doesn't get rechecked!
37832         }
37833       }
37834     }
37835 
37836     // vbroadcast(vzload X) -> vbroadcast_load X
37837     if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
37838       MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
37839       if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
37840         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37841         SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37842         SDValue BcastLd =
37843             DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37844                                     LN->getMemoryVT(), LN->getMemOperand());
37845         DCI.CombineTo(N.getNode(), BcastLd);
37846         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37847         DCI.recursivelyDeleteUnusedNodes(LN);
37848         return N; // Return N so it doesn't get rechecked!
37849       }
37850     }
37851 
37852     // vbroadcast(vector load X) -> vbroadcast_load
37853     if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
37854          SrcVT == MVT::v4i32) &&
37855         Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
37856       LoadSDNode *LN = cast<LoadSDNode>(Src);
37857       // Unless the load is volatile or atomic.
37858       if (LN->isSimple()) {
37859         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37860         SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37861         SDValue BcastLd = DAG.getMemIntrinsicNode(
37862             X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
37863             LN->getPointerInfo(), LN->getOriginalAlign(),
37864             LN->getMemOperand()->getFlags());
37865         DCI.CombineTo(N.getNode(), BcastLd);
37866         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37867         DCI.recursivelyDeleteUnusedNodes(LN);
37868         return N; // Return N so it doesn't get rechecked!
37869       }
37870     }
37871 
37872     return SDValue();
37873   }
37874   case X86ISD::VZEXT_MOVL: {
37875     SDValue N0 = N.getOperand(0);
37876 
    // If this is a vzmovl of a full vector load, replace it with a vzload,
    // unless the load is volatile.
37879     if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
37880       auto *LN = cast<LoadSDNode>(N0);
37881       if (SDValue VZLoad =
37882               narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
37883         DCI.CombineTo(N.getNode(), VZLoad);
37884         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37885         DCI.recursivelyDeleteUnusedNodes(LN);
37886         return N;
37887       }
37888     }
37889 
    // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the
    // broadcast and can just use a VZEXT_LOAD.
37892     // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
37893     if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
37894       auto *LN = cast<MemSDNode>(N0);
37895       if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
37896         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37897         SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37898         SDValue VZLoad =
37899             DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
37900                                     LN->getMemoryVT(), LN->getMemOperand());
37901         DCI.CombineTo(N.getNode(), VZLoad);
37902         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37903         DCI.recursivelyDeleteUnusedNodes(LN);
37904         return N;
37905       }
37906     }
37907 
37908     // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
37909     // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
37910     // if the upper bits of the i64 are zero.
37911     if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37912         N0.getOperand(0).hasOneUse() &&
37913         N0.getOperand(0).getValueType() == MVT::i64) {
37914       SDValue In = N0.getOperand(0);
37915       APInt Mask = APInt::getHighBitsSet(64, 32);
37916       if (DAG.MaskedValueIsZero(In, Mask)) {
37917         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
37918         MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
37919         SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
37920         SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
37921         return DAG.getBitcast(VT, Movl);
37922       }
37923     }
37924 
37925     // Load a scalar integer constant directly to XMM instead of transferring an
37926     // immediate value from GPR.
37927     // vzext_movl (scalar_to_vector C) --> load [C,0...]
37928     if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
37929       if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
37930         // Create a vector constant - scalar constant followed by zeros.
37931         EVT ScalarVT = N0.getOperand(0).getValueType();
37932         Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
37933         unsigned NumElts = VT.getVectorNumElements();
37934         Constant *Zero = ConstantInt::getNullValue(ScalarTy);
37935         SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
37936         ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
37937 
37938         // Load the vector constant from constant pool.
37939         MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
37940         SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
37941         MachinePointerInfo MPI =
37942             MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
37943         Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
37944         return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
37945                            MachineMemOperand::MOLoad);
37946       }
37947     }
37948 
37949     // Pull subvector inserts into undef through VZEXT_MOVL by making it an
37950     // insert into a zero vector. This helps get VZEXT_MOVL closer to
37951     // scalar_to_vectors where 256/512 are canonicalized to an insert and a
37952     // 128-bit scalar_to_vector. This reduces the number of isel patterns.
37953     if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
37954       SDValue V = peekThroughOneUseBitcasts(N0);
37955 
37956       if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
37957           isNullConstant(V.getOperand(2))) {
37958         SDValue In = V.getOperand(1);
37959         MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
37960                                      In.getValueSizeInBits() /
37961                                          VT.getScalarSizeInBits());
37962         In = DAG.getBitcast(SubVT, In);
37963         SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
37964         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
37965                            getZeroVector(VT, Subtarget, DAG, DL), Movl,
37966                            V.getOperand(2));
37967       }
37968     }
37969 
37970     return SDValue();
37971   }
37972   case X86ISD::BLENDI: {
37973     SDValue N0 = N.getOperand(0);
37974     SDValue N1 = N.getOperand(1);
37975 
37976     // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
37977     // TODO: Handle MVT::v16i16 repeated blend mask.
37978     if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
37979         N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
37980       MVT SrcVT = N0.getOperand(0).getSimpleValueType();
37981       if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
37982           SrcVT.getScalarSizeInBits() >= 32) {
37983         unsigned BlendMask = N.getConstantOperandVal(2);
37984         unsigned Size = VT.getVectorNumElements();
37985         unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
37986         BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
37987         return DAG.getBitcast(
37988             VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
37989                             N1.getOperand(0),
37990                             DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
37991       }
37992     }
37993     return SDValue();
37994   }
37995   case X86ISD::VPERMI: {
37996     // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
37997     // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
37998     SDValue N0 = N.getOperand(0);
37999     SDValue N1 = N.getOperand(1);
38000     unsigned EltSizeInBits = VT.getScalarSizeInBits();
38001     if (N0.getOpcode() == ISD::BITCAST &&
38002         N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
38003       SDValue Src = N0.getOperand(0);
38004       EVT SrcVT = Src.getValueType();
38005       SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
38006       return DAG.getBitcast(VT, Res);
38007     }
38008     return SDValue();
38009   }
38010   case X86ISD::VPERM2X128: {
38011     // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
38012     SDValue LHS = N->getOperand(0);
38013     SDValue RHS = N->getOperand(1);
38014     if (LHS.getOpcode() == ISD::BITCAST &&
38015         (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
38016       EVT SrcVT = LHS.getOperand(0).getValueType();
38017       if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
38018         return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
38019                                               DAG.getBitcast(SrcVT, LHS),
38020                                               DAG.getBitcast(SrcVT, RHS),
38021                                               N->getOperand(2)));
38022       }
38023     }
38024 
38025     // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
38026     if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
38027       return Res;
38028 
38029     // Fold vperm2x128 subvector shuffle with an inner concat pattern.
38030     // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
38031     auto FindSubVector128 = [&](unsigned Idx) {
38032       if (Idx > 3)
38033         return SDValue();
38034       SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
38035       SmallVector<SDValue> SubOps;
38036       if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
38037         return SubOps[Idx & 1];
38038       unsigned NumElts = Src.getValueType().getVectorNumElements();
38039       if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
38040           Src.getOperand(1).getValueSizeInBits() == 128 &&
38041           Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
38042         return Src.getOperand(1);
38043       }
38044       return SDValue();
38045     };
38046     unsigned Imm = N.getConstantOperandVal(2);
38047     if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
38048       if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
38049         MVT SubVT = VT.getHalfNumVectorElementsVT();
38050         SubLo = DAG.getBitcast(SubVT, SubLo);
38051         SubHi = DAG.getBitcast(SubVT, SubHi);
38052         return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
38053       }
38054     }
38055     return SDValue();
38056   }
38057   case X86ISD::PSHUFD:
38058   case X86ISD::PSHUFLW:
38059   case X86ISD::PSHUFHW:
38060     Mask = getPSHUFShuffleMask(N);
38061     assert(Mask.size() == 4);
38062     break;
38063   case X86ISD::MOVSD:
38064   case X86ISD::MOVSS: {
38065     SDValue N0 = N.getOperand(0);
38066     SDValue N1 = N.getOperand(1);
38067 
38068     // Canonicalize scalar FPOps:
38069     // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
38070     // If commutable, allow OP(N1[0], N0[0]).
38071     unsigned Opcode1 = N1.getOpcode();
38072     if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
38073         Opcode1 == ISD::FDIV) {
38074       SDValue N10 = N1.getOperand(0);
38075       SDValue N11 = N1.getOperand(1);
38076       if (N10 == N0 ||
38077           (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
38078         if (N10 != N0)
38079           std::swap(N10, N11);
38080         MVT SVT = VT.getVectorElementType();
38081         SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
38082         N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
38083         N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
38084         SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
38085         SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
38086         return DAG.getNode(Opcode, DL, VT, N0, SclVec);
38087       }
38088     }
38089 
38090     return SDValue();
38091   }
38092   case X86ISD::INSERTPS: {
38093     assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
38094     SDValue Op0 = N.getOperand(0);
38095     SDValue Op1 = N.getOperand(1);
38096     unsigned InsertPSMask = N.getConstantOperandVal(2);
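    // The INSERTPS immediate encodes the Op1 source element in bits [7:6],
    // the destination element in bits [5:4] and a zero mask in bits [3:0].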
38097     unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
38098     unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
38099     unsigned ZeroMask = InsertPSMask & 0xF;
38100 
38101     // If we zero out all elements from Op0 then we don't need to reference it.
38102     if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
38103       return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
38104                          DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38105 
38106     // If we zero out the element from Op1 then we don't need to reference it.
38107     if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
38108       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
38109                          DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38110 
38111     // Attempt to merge insertps Op1 with an inner target shuffle node.
38112     SmallVector<int, 8> TargetMask1;
38113     SmallVector<SDValue, 2> Ops1;
38114     APInt KnownUndef1, KnownZero1;
38115     if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
38116                                      KnownZero1)) {
38117       if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
38118         // Zero/UNDEF insertion - zero out element and remove dependency.
38119         InsertPSMask |= (1u << DstIdx);
38120         return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
38121                            DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38122       }
38123       // Update insertps mask srcidx and reference the source input directly.
38124       int M = TargetMask1[SrcIdx];
38125       assert(0 <= M && M < 8 && "Shuffle index out of range");
38126       InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
38127       Op1 = Ops1[M < 4 ? 0 : 1];
38128       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
38129                          DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38130     }
38131 
38132     // Attempt to merge insertps Op0 with an inner target shuffle node.
38133     SmallVector<int, 8> TargetMask0;
38134     SmallVector<SDValue, 2> Ops0;
38135     APInt KnownUndef0, KnownZero0;
38136     if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
38137                                      KnownZero0)) {
38138       bool Updated = false;
38139       bool UseInput00 = false;
38140       bool UseInput01 = false;
38141       for (int i = 0; i != 4; ++i) {
38142         if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
38143           // No change if element is already zero or the inserted element.
38144           continue;
38145         } else if (KnownUndef0[i] || KnownZero0[i]) {
38146           // If the target mask is undef/zero then we must zero the element.
38147           InsertPSMask |= (1u << i);
38148           Updated = true;
38149           continue;
38150         }
38151 
38152         // The input vector element must be inline.
38153         int M = TargetMask0[i];
38154         if (M != i && M != (i + 4))
38155           return SDValue();
38156 
38157         // Determine which inputs of the target shuffle we're using.
38158         UseInput00 |= (0 <= M && M < 4);
38159         UseInput01 |= (4 <= M);
38160       }
38161 
38162       // If we're not using both inputs of the target shuffle then use the
38163       // referenced input directly.
38164       if (UseInput00 && !UseInput01) {
38165         Updated = true;
38166         Op0 = Ops0[0];
38167       } else if (!UseInput00 && UseInput01) {
38168         Updated = true;
38169         Op0 = Ops0[1];
38170       }
38171 
38172       if (Updated)
38173         return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
38174                            DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38175     }
38176 
38177     // If we're inserting an element from a vbroadcast load, fold the
38178     // load into the X86insertps instruction. We need to convert the scalar
38179     // load to a vector and clear the source lane of the INSERTPS control.
38180     if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
38181       auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
38182       if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
38183         SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
38184                                    MemIntr->getBasePtr(),
38185                                    MemIntr->getMemOperand());
        SDValue Insert = DAG.getNode(
            X86ISD::INSERTPS, DL, VT, Op0,
            DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Load),
            DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
38190         DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
38191         return Insert;
38192       }
38193     }
38194 
38195     return SDValue();
38196   }
38197   default:
38198     return SDValue();
38199   }
38200 
38201   // Nuke no-op shuffles that show up after combining.
38202   if (isNoopShuffleMask(Mask))
38203     return N.getOperand(0);
38204 
38205   // Look for simplifications involving one or two shuffle instructions.
38206   SDValue V = N.getOperand(0);
38207   switch (N.getOpcode()) {
38208   default:
38209     break;
38210   case X86ISD::PSHUFLW:
38211   case X86ISD::PSHUFHW:
38212     assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
38213 
38214     // See if this reduces to a PSHUFD which is no more expensive and can
38215     // combine with more operations. Note that it has to at least flip the
38216     // dwords as otherwise it would have been removed as a no-op.
38217     if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
38218       int DMask[] = {0, 1, 2, 3};
38219       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
38220       DMask[DOffset + 0] = DOffset + 1;
38221       DMask[DOffset + 1] = DOffset + 0;
38222       MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
38223       V = DAG.getBitcast(DVT, V);
38224       V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
38225                       getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
38226       return DAG.getBitcast(VT, V);
38227     }
38228 
38229     // Look for shuffle patterns which can be implemented as a single unpack.
38230     // FIXME: This doesn't handle the location of the PSHUFD generically, and
38231     // only works when we have a PSHUFD followed by two half-shuffles.
38232     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
38233         (V.getOpcode() == X86ISD::PSHUFLW ||
38234          V.getOpcode() == X86ISD::PSHUFHW) &&
38235         V.getOpcode() != N.getOpcode() &&
38236         V.hasOneUse() && V.getOperand(0).hasOneUse()) {
38237       SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
38238       if (D.getOpcode() == X86ISD::PSHUFD) {
38239         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
38240         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
38241         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
38242         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
38243         int WordMask[8];
38244         for (int i = 0; i < 4; ++i) {
38245           WordMask[i + NOffset] = Mask[i] + NOffset;
38246           WordMask[i + VOffset] = VMask[i] + VOffset;
38247         }
38248         // Map the word mask through the DWord mask.
38249         int MappedMask[8];
38250         for (int i = 0; i < 8; ++i)
38251           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
38252         if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
38253             makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
38254           // We can replace all three shuffles with an unpack.
38255           V = DAG.getBitcast(VT, D.getOperand(0));
38256           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
38257                                                 : X86ISD::UNPCKH,
38258                              DL, VT, V, V);
38259         }
38260       }
38261     }
38262 
38263     break;
38264 
38265   case X86ISD::PSHUFD:
38266     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
38267       return NewN;
38268 
38269     break;
38270   }
38271 
38272   return SDValue();
38273 }
38274 
38275 /// Checks if the shuffle mask takes subsequent elements
38276 /// alternately from two vectors.
38277 /// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
38278 static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
38279 
38280   int ParitySrc[2] = {-1, -1};
38281   unsigned Size = Mask.size();
38282   for (unsigned i = 0; i != Size; ++i) {
38283     int M = Mask[i];
38284     if (M < 0)
38285       continue;
38286 
38287     // Make sure we are using the matching element from the input.
38288     if ((M % Size) != i)
38289       return false;
38290 
38291     // Make sure we use the same input for all elements of the same parity.
38292     int Src = M / Size;
38293     if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
38294       return false;
38295     ParitySrc[i % 2] = Src;
38296   }
38297 
38298   // Make sure each input is used.
38299   if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
38300     return false;
38301 
38302   Op0Even = ParitySrc[0] == 0;
38303   return true;
38304 }
38305 
/// Returns true iff the shuffle node \p N can be replaced with an
/// ADDSUB(SUBADD) operation. If true is returned then the operands of the
/// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and
/// \p Opnd1.
///
/// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
/// shuffle nodes so that it is easier to generically match. We also insert
/// dummy vector shuffle nodes for the operands which explicitly discard the
/// lanes which are unused by this operation to try to flow through the rest
/// of the combiner the fact that they're unused.
38315 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
38316                              SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
38317                              bool &IsSubAdd) {
38318 
38319   EVT VT = N->getValueType(0);
38320   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38321   if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
38322       !VT.getSimpleVT().isFloatingPoint())
38323     return false;
38324 
38325   // We only handle target-independent shuffles.
38326   // FIXME: It would be easy and harmless to use the target shuffle mask
38327   // extraction tool to support more.
38328   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
38329     return false;
38330 
38331   SDValue V1 = N->getOperand(0);
38332   SDValue V2 = N->getOperand(1);
38333 
38334   // Make sure we have an FADD and an FSUB.
38335   if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
38336       (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
38337       V1.getOpcode() == V2.getOpcode())
38338     return false;
38339 
38340   // If there are other uses of these operations we can't fold them.
38341   if (!V1->hasOneUse() || !V2->hasOneUse())
38342     return false;
38343 
38344   // Ensure that both operations have the same operands. Note that we can
38345   // commute the FADD operands.
38346   SDValue LHS, RHS;
38347   if (V1.getOpcode() == ISD::FSUB) {
38348     LHS = V1->getOperand(0); RHS = V1->getOperand(1);
38349     if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
38350         (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
38351       return false;
38352   } else {
38353     assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
38354     LHS = V2->getOperand(0); RHS = V2->getOperand(1);
38355     if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
38356         (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
38357       return false;
38358   }
38359 
38360   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
38361   bool Op0Even;
38362   if (!isAddSubOrSubAddMask(Mask, Op0Even))
38363     return false;
38364 
38365   // It's a subadd if the vector in the even parity is an FADD.
38366   IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
38367                      : V2->getOpcode() == ISD::FADD;
38368 
38369   Opnd0 = LHS;
38370   Opnd1 = RHS;
38371   return true;
38372 }
38373 
38374 /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
38375 static SDValue combineShuffleToFMAddSub(SDNode *N,
38376                                         const X86Subtarget &Subtarget,
38377                                         SelectionDAG &DAG) {
38378   // We only handle target-independent shuffles.
38379   // FIXME: It would be easy and harmless to use the target shuffle mask
38380   // extraction tool to support more.
38381   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
38382     return SDValue();
38383 
38384   MVT VT = N->getSimpleValueType(0);
38385   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38386   if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
38387     return SDValue();
38388 
  // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
38390   SDValue Op0 = N->getOperand(0);
38391   SDValue Op1 = N->getOperand(1);
38392   SDValue FMAdd = Op0, FMSub = Op1;
38393   if (FMSub.getOpcode() != X86ISD::FMSUB)
38394     std::swap(FMAdd, FMSub);
38395 
38396   if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
38397       FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
38398       FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
38399       FMAdd.getOperand(2) != FMSub.getOperand(2))
38400     return SDValue();
38401 
38402   // Check for correct shuffle mask.
38403   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
38404   bool Op0Even;
38405   if (!isAddSubOrSubAddMask(Mask, Op0Even))
38406     return SDValue();
38407 
  // Both nodes were verified above to have identical operands, so build the
  // new node from the FMA node's operands.
38409   SDLoc DL(N);
38410   bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
38411   unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
38412   return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
38413                      FMAdd.getOperand(2));
38414 }
38415 
38416 /// Try to combine a shuffle into a target-specific add-sub or
38417 /// mul-add-sub node.
38418 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
38419                                                 const X86Subtarget &Subtarget,
38420                                                 SelectionDAG &DAG) {
38421   if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
38422     return V;
38423 
38424   SDValue Opnd0, Opnd1;
38425   bool IsSubAdd;
38426   if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
38427     return SDValue();
38428 
38429   MVT VT = N->getSimpleValueType(0);
38430   SDLoc DL(N);
38431 
38432   // Try to generate X86ISD::FMADDSUB node here.
38433   SDValue Opnd2;
38434   if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
38435     unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
38436     return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
38437   }
38438 
38439   if (IsSubAdd)
38440     return SDValue();
38441 
38442   // Do not generate X86ISD::ADDSUB node for 512-bit types even though
38443   // the ADDSUB idiom has been successfully recognized. There are no known
38444   // X86 targets with 512-bit ADDSUB instructions!
38445   if (VT.is512BitVector())
38446     return SDValue();
38447 
38448   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
38449 }
38450 
38451 // We are looking for a shuffle where both sources are concatenated with undef
38452 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
38453 // if we can express this as a single-source shuffle, that's preferable.
38454 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
38455                                            const X86Subtarget &Subtarget) {
38456   if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
38457     return SDValue();
38458 
38459   EVT VT = N->getValueType(0);
38460 
38461   // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
38462   if (!VT.is128BitVector() && !VT.is256BitVector())
38463     return SDValue();
38464 
38465   if (VT.getVectorElementType() != MVT::i32 &&
38466       VT.getVectorElementType() != MVT::i64 &&
38467       VT.getVectorElementType() != MVT::f32 &&
38468       VT.getVectorElementType() != MVT::f64)
38469     return SDValue();
38470 
38471   SDValue N0 = N->getOperand(0);
38472   SDValue N1 = N->getOperand(1);
38473 
38474   // Check that both sources are concats with undef.
38475   if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
38476       N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
38477       N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
38478       !N1.getOperand(1).isUndef())
38479     return SDValue();
38480 
38481   // Construct the new shuffle mask. Elements from the first source retain their
38482   // index, but elements from the second source no longer need to skip an undef.
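  // e.g. for <8 x i32> sources, old mask index 8 (t2 element 0) becomes 4,
  // since t2 now starts at index 4 of the combined concat.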
38483   SmallVector<int, 8> Mask;
38484   int NumElts = VT.getVectorNumElements();
38485 
38486   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
38487   for (int Elt : SVOp->getMask())
38488     Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
38489 
38490   SDLoc DL(N);
38491   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
38492                                N1.getOperand(0));
38493   return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
38494 }
38495 
38496 /// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
38497 /// low half of each source vector and does not set any high half elements in
38498 /// the destination vector, narrow the shuffle to half its original size.
38499 static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
38500   if (!Shuf->getValueType(0).isSimple())
38501     return SDValue();
38502   MVT VT = Shuf->getSimpleValueType(0);
38503   if (!VT.is256BitVector() && !VT.is512BitVector())
38504     return SDValue();
38505 
38506   // See if we can ignore all of the high elements of the shuffle.
38507   ArrayRef<int> Mask = Shuf->getMask();
38508   if (!isUndefUpperHalf(Mask))
38509     return SDValue();
38510 
38511   // Check if the shuffle mask accesses only the low half of each input vector
38512   // (half-index output is 0 or 2).
38513   int HalfIdx1, HalfIdx2;
38514   SmallVector<int, 8> HalfMask(Mask.size() / 2);
38515   if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
38516       (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
38517     return SDValue();
38518 
38519   // Create a half-width shuffle to replace the unnecessarily wide shuffle.
38520   // The trick is knowing that all of the insert/extract are actually free
38521   // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
38522   // of narrow inputs into a narrow output, and that is always cheaper than
38523   // the wide shuffle that we started with.
38524   return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
38525                                Shuf->getOperand(1), HalfMask, HalfIdx1,
38526                                HalfIdx2, false, DAG, /*UseConcat*/true);
38527 }
38528 
38529 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
38530                               TargetLowering::DAGCombinerInfo &DCI,
38531                               const X86Subtarget &Subtarget) {
38532   if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
38533     if (SDValue V = narrowShuffle(Shuf, DAG))
38534       return V;
38535 
38536   // If we have legalized the vector types, look for blends of FADD and FSUB
38537   // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
38538   SDLoc dl(N);
38539   EVT VT = N->getValueType(0);
38540   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38541   if (TLI.isTypeLegal(VT))
38542     if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
38543       return AddSub;
38544 
38545   // Attempt to combine into a vector load/broadcast.
38546   if (SDValue LD = combineToConsecutiveLoads(
38547           VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
38548     return LD;
38549 
38550   // For AVX2, we sometimes want to combine
38551   // (vector_shuffle <mask> (concat_vectors t1, undef)
38552   //                        (concat_vectors t2, undef))
38553   // Into:
38554   // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
38555   // Since the latter can be efficiently lowered with VPERMD/VPERMQ
38556   if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
38557     return ShufConcat;
38558 
38559   if (isTargetShuffle(N->getOpcode())) {
38560     SDValue Op(N, 0);
38561     if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
38562       return Shuffle;
38563 
38564     // Try recursively combining arbitrary sequences of x86 shuffle
38565     // instructions into higher-order shuffles. We do this after combining
38566     // specific PSHUF instruction sequences into their minimal form so that we
38567     // can evaluate how many specialized shuffle instructions are involved in
38568     // a particular chain.
38569     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
38570       return Res;
38571 
38572     // Simplify source operands based on shuffle mask.
38573     // TODO - merge this into combineX86ShufflesRecursively.
38574     APInt KnownUndef, KnownZero;
38575     APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
38576     if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
38577                                        DCI))
38578       return SDValue(N, 0);
38579   }
38580 
38581   return SDValue();
38582 }
38583 
38584 // Simplify variable target shuffle masks based on the demanded elements.
38585 // TODO: Handle DemandedBits in mask indices as well?
38586 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
38587     SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
38588     TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
38589   // If we're demanding all elements don't bother trying to simplify the mask.
38590   unsigned NumElts = DemandedElts.getBitWidth();
38591   if (DemandedElts.isAllOnesValue())
38592     return false;
38593 
38594   SDValue Mask = Op.getOperand(MaskIndex);
38595   if (!Mask.hasOneUse())
38596     return false;
38597 
38598   // Attempt to generically simplify the variable shuffle mask.
38599   APInt MaskUndef, MaskZero;
38600   if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
38601                                  Depth + 1))
38602     return true;
38603 
38604   // Attempt to extract+simplify a (constant pool load) shuffle mask.
38605   // TODO: Support other types from getTargetShuffleMaskIndices?
38606   SDValue BC = peekThroughOneUseBitcasts(Mask);
38607   EVT BCVT = BC.getValueType();
38608   auto *Load = dyn_cast<LoadSDNode>(BC);
38609   if (!Load)
38610     return false;
38611 
38612   const Constant *C = getTargetConstantFromNode(Load);
38613   if (!C)
38614     return false;
38615 
38616   Type *CTy = C->getType();
38617   if (!CTy->isVectorTy() ||
38618       CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
38619     return false;
38620 
38621   // Handle scaling for i64 elements on 32-bit targets.
38622   unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
38623   if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
38624     return false;
38625   unsigned Scale = NumCstElts / NumElts;
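  // Each demanded shuffle element then covers Scale consecutive constant
  // elements (e.g. two i32 constants per i64 mask element).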
38626 
38627   // Simplify mask if we have an undemanded element that is not undef.
38628   bool Simplified = false;
38629   SmallVector<Constant *, 32> ConstVecOps;
38630   for (unsigned i = 0; i != NumCstElts; ++i) {
38631     Constant *Elt = C->getAggregateElement(i);
38632     if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
38633       ConstVecOps.push_back(UndefValue::get(Elt->getType()));
38634       Simplified = true;
38635       continue;
38636     }
38637     ConstVecOps.push_back(Elt);
38638   }
38639   if (!Simplified)
38640     return false;
38641 
38642   // Generate new constant pool entry + legalize immediately for the load.
38643   SDLoc DL(Op);
38644   SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
38645   SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
38646   SDValue NewMask = TLO.DAG.getLoad(
38647       BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
38648       MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
38649       Load->getAlign());
38650   return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
38651 }
38652 
38653 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
38654     SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
38655     TargetLoweringOpt &TLO, unsigned Depth) const {
38656   int NumElts = DemandedElts.getBitWidth();
38657   unsigned Opc = Op.getOpcode();
38658   EVT VT = Op.getValueType();
38659 
38660   // Handle special case opcodes.
38661   switch (Opc) {
38662   case X86ISD::PMULDQ:
38663   case X86ISD::PMULUDQ: {
38664     APInt LHSUndef, LHSZero;
38665     APInt RHSUndef, RHSZero;
38666     SDValue LHS = Op.getOperand(0);
38667     SDValue RHS = Op.getOperand(1);
38668     if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
38669                                    Depth + 1))
38670       return true;
38671     if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
38672                                    Depth + 1))
38673       return true;
38674     // Multiply by zero.
38675     KnownZero = LHSZero | RHSZero;
38676     break;
38677   }
38678   case X86ISD::VSHL:
38679   case X86ISD::VSRL:
38680   case X86ISD::VSRA: {
38681     // We only need the bottom 64-bits of the (128-bit) shift amount.
38682     SDValue Amt = Op.getOperand(1);
38683     MVT AmtVT = Amt.getSimpleValueType();
38684     assert(AmtVT.is128BitVector() && "Unexpected value type");
38685 
    // If the value is only ever used as an SSE shift amount then we know
    // that only its bottom 64-bits are ever read.
38688     bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
38689       unsigned UseOpc = Use->getOpcode();
38690       return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
38691               UseOpc == X86ISD::VSRA) &&
38692              Use->getOperand(0) != Amt;
38693     });
38694 
38695     APInt AmtUndef, AmtZero;
38696     unsigned NumAmtElts = AmtVT.getVectorNumElements();
38697     APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
38698     if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
38699                                    Depth + 1, AssumeSingleUse))
38700       return true;
38701     LLVM_FALLTHROUGH;
38702   }
38703   case X86ISD::VSHLI:
38704   case X86ISD::VSRLI:
38705   case X86ISD::VSRAI: {
38706     SDValue Src = Op.getOperand(0);
38707     APInt SrcUndef;
38708     if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
38709                                    Depth + 1))
38710       return true;
38711 
38712     // Aggressively peek through ops to get at the demanded elts.
38713     if (!DemandedElts.isAllOnesValue())
38714       if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
38715               Src, DemandedElts, TLO.DAG, Depth + 1))
38716         return TLO.CombineTo(
38717             Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
38718     break;
38719   }
38720   case X86ISD::KSHIFTL: {
38721     SDValue Src = Op.getOperand(0);
38722     auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
38723     assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
38724     unsigned ShiftAmt = Amt->getZExtValue();
38725 
38726     if (ShiftAmt == 0)
38727       return TLO.CombineTo(Op, Src);
38728 
38729     // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
38730     // single shift.  We can do this if the bottom bits (which are shifted
38731     // out) are never demanded.
38732     if (Src.getOpcode() == X86ISD::KSHIFTR) {
38733       if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
38734         unsigned C1 = Src.getConstantOperandVal(1);
38735         unsigned NewOpc = X86ISD::KSHIFTL;
38736         int Diff = ShiftAmt - C1;
38737         if (Diff < 0) {
38738           Diff = -Diff;
38739           NewOpc = X86ISD::KSHIFTR;
38740         }
38741 
38742         SDLoc dl(Op);
38743         SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
38744         return TLO.CombineTo(
38745             Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
38746       }
38747     }
38748 
38749     APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
38750     if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
38751                                    Depth + 1))
38752       return true;
38753 
38754     KnownUndef <<= ShiftAmt;
38755     KnownZero <<= ShiftAmt;
38756     KnownZero.setLowBits(ShiftAmt);
38757     break;
38758   }
38759   case X86ISD::KSHIFTR: {
38760     SDValue Src = Op.getOperand(0);
38761     auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
38762     assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
38763     unsigned ShiftAmt = Amt->getZExtValue();
38764 
38765     if (ShiftAmt == 0)
38766       return TLO.CombineTo(Op, Src);
38767 
38768     // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
38769     // single shift.  We can do this if the top bits (which are shifted
38770     // out) are never demanded.
38771     if (Src.getOpcode() == X86ISD::KSHIFTL) {
38772       if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
38773         unsigned C1 = Src.getConstantOperandVal(1);
38774         unsigned NewOpc = X86ISD::KSHIFTR;
38775         int Diff = ShiftAmt - C1;
38776         if (Diff < 0) {
38777           Diff = -Diff;
38778           NewOpc = X86ISD::KSHIFTL;
38779         }
38780 
38781         SDLoc dl(Op);
38782         SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
38783         return TLO.CombineTo(
38784             Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
38785       }
38786     }
38787 
38788     APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
38789     if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
38790                                    Depth + 1))
38791       return true;
38792 
38793     KnownUndef.lshrInPlace(ShiftAmt);
38794     KnownZero.lshrInPlace(ShiftAmt);
38795     KnownZero.setHighBits(ShiftAmt);
38796     break;
38797   }
38798   case X86ISD::CVTSI2P:
38799   case X86ISD::CVTUI2P: {
38800     SDValue Src = Op.getOperand(0);
38801     MVT SrcVT = Src.getSimpleValueType();
38802     APInt SrcUndef, SrcZero;
38803     APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38804     if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
38805                                    Depth + 1))
38806       return true;
38807     break;
38808   }
38809   case X86ISD::PACKSS:
38810   case X86ISD::PACKUS: {
38811     SDValue N0 = Op.getOperand(0);
38812     SDValue N1 = Op.getOperand(1);
38813 
38814     APInt DemandedLHS, DemandedRHS;
38815     getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38816 
38817     APInt LHSUndef, LHSZero;
38818     if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
38819                                    Depth + 1))
38820       return true;
38821     APInt RHSUndef, RHSZero;
38822     if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
38823                                    Depth + 1))
38824       return true;
38825 
38826     // TODO - pass on known zero/undef.
38827 
38828     // Aggressively peek through ops to get at the demanded elts.
38829     // TODO - we should do this for all target/faux shuffles ops.
38830     if (!DemandedElts.isAllOnesValue()) {
38831       SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
38832                                                             TLO.DAG, Depth + 1);
38833       SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
38834                                                             TLO.DAG, Depth + 1);
38835       if (NewN0 || NewN1) {
38836         NewN0 = NewN0 ? NewN0 : N0;
38837         NewN1 = NewN1 ? NewN1 : N1;
38838         return TLO.CombineTo(Op,
38839                              TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
38840       }
38841     }
38842     break;
38843   }
38844   case X86ISD::HADD:
38845   case X86ISD::HSUB:
38846   case X86ISD::FHADD:
38847   case X86ISD::FHSUB: {
38848     SDValue N0 = Op.getOperand(0);
38849     SDValue N1 = Op.getOperand(1);
38850 
38851     APInt DemandedLHS, DemandedRHS;
38852     getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
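    // Each result element combines a pair of adjacent elements from one of
    // the sources, so map the demanded result elements back to each input.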
38853 
38854     APInt LHSUndef, LHSZero;
38855     if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
38856                                    Depth + 1))
38857       return true;
38858     APInt RHSUndef, RHSZero;
38859     if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
38860                                    Depth + 1))
38861       return true;
38862 
38863     // TODO - pass on known zero/undef.
38864 
38865     // Aggressively peek through ops to get at the demanded elts.
38866     // TODO: Handle repeated operands.
38867     if (N0 != N1 && !DemandedElts.isAllOnesValue()) {
38868       SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
38869                                                             TLO.DAG, Depth + 1);
38870       SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
38871                                                             TLO.DAG, Depth + 1);
38872       if (NewN0 || NewN1) {
38873         NewN0 = NewN0 ? NewN0 : N0;
38874         NewN1 = NewN1 ? NewN1 : N1;
38875         return TLO.CombineTo(Op,
38876                              TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
38877       }
38878     }
38879     break;
38880   }
38881   case X86ISD::VTRUNC:
38882   case X86ISD::VTRUNCS:
38883   case X86ISD::VTRUNCUS: {
38884     SDValue Src = Op.getOperand(0);
38885     MVT SrcVT = Src.getSimpleValueType();
38886     APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38887     APInt SrcUndef, SrcZero;
38888     if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
38889                                    Depth + 1))
38890       return true;
38891     KnownZero = SrcZero.zextOrTrunc(NumElts);
38892     KnownUndef = SrcUndef.zextOrTrunc(NumElts);
38893     break;
38894   }
38895   case X86ISD::BLENDV: {
38896     APInt SelUndef, SelZero;
38897     if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
38898                                    SelZero, TLO, Depth + 1))
38899       return true;
38900 
38901     // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
38902     APInt LHSUndef, LHSZero;
38903     if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
38904                                    LHSZero, TLO, Depth + 1))
38905       return true;
38906 
38907     APInt RHSUndef, RHSZero;
38908     if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
38909                                    RHSZero, TLO, Depth + 1))
38910       return true;
38911 
38912     KnownZero = LHSZero & RHSZero;
38913     KnownUndef = LHSUndef & RHSUndef;
38914     break;
38915   }
38916   case X86ISD::VZEXT_MOVL: {
38917     // If upper demanded elements are already zero then we have nothing to do.
38918     SDValue Src = Op.getOperand(0);
38919     APInt DemandedUpperElts = DemandedElts;
38920     DemandedUpperElts.clearLowBits(1);
38921     if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
38922       return TLO.CombineTo(Op, Src);
38923     break;
38924   }
38925   case X86ISD::VBROADCAST: {
38926     SDValue Src = Op.getOperand(0);
38927     MVT SrcVT = Src.getSimpleValueType();
38928     if (!SrcVT.isVector())
38929       break;
    // Don't bother broadcasting if we just need the 0th element.
38931     if (DemandedElts == 1) {
38932       if (Src.getValueType() != VT)
38933         Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
38934                              SDLoc(Op));
38935       return TLO.CombineTo(Op, Src);
38936     }
38937     APInt SrcUndef, SrcZero;
38938     APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
38939     if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
38940                                    Depth + 1))
38941       return true;
38942     // Aggressively peek through src to get at the demanded elt.
38943     // TODO - we should do this for all target/faux shuffles ops.
38944     if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
38945             Src, SrcElts, TLO.DAG, Depth + 1))
38946       return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
38947     break;
38948   }
38949   case X86ISD::VPERMV:
38950     if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
38951                                                    Depth))
38952       return true;
38953     break;
38954   case X86ISD::PSHUFB:
38955   case X86ISD::VPERMV3:
38956   case X86ISD::VPERMILPV:
38957     if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
38958                                                    Depth))
38959       return true;
38960     break;
38961   case X86ISD::VPPERM:
38962   case X86ISD::VPERMIL2:
38963     if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
38964                                                    Depth))
38965       return true;
38966     break;
38967   }
38968 
38969   // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
38970   // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
38971   // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
38972   if ((VT.is256BitVector() || VT.is512BitVector()) &&
38973       DemandedElts.lshr(NumElts / 2) == 0) {
38974     unsigned SizeInBits = VT.getSizeInBits();
38975     unsigned ExtSizeInBits = SizeInBits / 2;
38976 
38977     // See if 512-bit ops only use the bottom 128-bits.
38978     if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
38979       ExtSizeInBits = SizeInBits / 4;
38980 
38981     switch (Opc) {
38982       // Scalar broadcast.
38983     case X86ISD::VBROADCAST: {
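      // Broadcast into just the demanded low subvector and reinsert it into
      // an undef vector of the original width.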
38984       SDLoc DL(Op);
38985       SDValue Src = Op.getOperand(0);
38986       if (Src.getValueSizeInBits() > ExtSizeInBits)
38987         Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
38988       EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
38989                                     ExtSizeInBits / VT.getScalarSizeInBits());
38990       SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
38991       return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
38992                                                TLO.DAG, DL, ExtSizeInBits));
38993     }
38994     case X86ISD::VBROADCAST_LOAD: {
38995       SDLoc DL(Op);
38996       auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
38997       EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
38998                                     ExtSizeInBits / VT.getScalarSizeInBits());
38999       SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
39000       SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
39001       SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
39002           X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
39003           MemIntr->getMemOperand());
39004       TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39005                                            Bcst.getValue(1));
39006       return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
39007                                                TLO.DAG, DL, ExtSizeInBits));
39008     }
39009       // Subvector broadcast.
39010     case X86ISD::SUBV_BROADCAST_LOAD: {
39011       auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
39012       EVT MemVT = MemIntr->getMemoryVT();
39013       if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
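        // The demanded width matches the broadcasted memory width, so a plain
        // load of the subvector is enough.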
39014         SDLoc DL(Op);
39015         SDValue Ld =
39016             TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
39017                             MemIntr->getBasePtr(), MemIntr->getMemOperand());
39018         TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39019                                              Ld.getValue(1));
39020         return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
39021                                                  TLO.DAG, DL, ExtSizeInBits));
39022       } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
39023         SDLoc DL(Op);
39024         EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
39025                                       ExtSizeInBits / VT.getScalarSizeInBits());
39026         SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
39027         SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
39028         SDValue Bcst =
39029             TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys,
39030                                         Ops, MemVT, MemIntr->getMemOperand());
39031         TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39032                                              Bcst.getValue(1));
39033         return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
39034                                                  TLO.DAG, DL, ExtSizeInBits));
39035       }
39036       break;
39037     }
39038       // Byte shifts by immediate.
39039     case X86ISD::VSHLDQ:
39040     case X86ISD::VSRLDQ:
39041       // Shift by uniform.
39042     case X86ISD::VSHL:
39043     case X86ISD::VSRL:
39044     case X86ISD::VSRA:
39045       // Shift by immediate.
39046     case X86ISD::VSHLI:
39047     case X86ISD::VSRLI:
39048     case X86ISD::VSRAI: {
39049       SDLoc DL(Op);
39050       SDValue Ext0 =
39051           extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
39052       SDValue ExtOp =
39053           TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
39054       SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39055       SDValue Insert =
39056           insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39057       return TLO.CombineTo(Op, Insert);
39058     }
39059     case X86ISD::VPERMI: {
39060       // Simplify PERMPD/PERMQ to extract_subvector.
39061       // TODO: This should be done in shuffle combining.
39062       if (VT == MVT::v4f64 || VT == MVT::v4i64) {
39063         SmallVector<int, 4> Mask;
39064         DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
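        // Only the low 128 bits are demanded; if they come from the upper
        // half of the source (mask elements 2,3) extract that half directly.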
39065         if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
39066           SDLoc DL(Op);
39067           SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
39068           SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39069           SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
39070           return TLO.CombineTo(Op, Insert);
39071         }
39072       }
39073       break;
39074     }
39075     case X86ISD::VPERM2X128: {
39076       // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
39077       SDLoc DL(Op);
39078       unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
39079       if (LoMask & 0x8)
39080         return TLO.CombineTo(
39081             Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
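      // Within the nibble, bit 1 selects the source operand and bit 0 selects
      // its low or high 128-bit half.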
39082       unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
39083       unsigned SrcIdx = (LoMask & 0x2) >> 1;
39084       SDValue ExtOp =
39085           extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
39086       SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39087       SDValue Insert =
39088           insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39089       return TLO.CombineTo(Op, Insert);
39090     }
39091       // Zero upper elements.
39092     case X86ISD::VZEXT_MOVL:
39093       // Target unary shuffles by immediate:
39094     case X86ISD::PSHUFD:
39095     case X86ISD::PSHUFLW:
39096     case X86ISD::PSHUFHW:
39097     case X86ISD::VPERMILPI:
39098       // (Non-Lane Crossing) Target Shuffles.
39099     case X86ISD::VPERMILPV:
39100     case X86ISD::VPERMIL2:
39101     case X86ISD::PSHUFB:
39102     case X86ISD::UNPCKL:
39103     case X86ISD::UNPCKH:
39104     case X86ISD::BLENDI:
39105       // Integer ops.
39106     case X86ISD::AVG:
39107     case X86ISD::PACKSS:
39108     case X86ISD::PACKUS:
39109       // Horizontal Ops.
39110     case X86ISD::HADD:
39111     case X86ISD::HSUB:
39112     case X86ISD::FHADD:
39113     case X86ISD::FHSUB: {
39114       SDLoc DL(Op);
39115       SmallVector<SDValue, 4> Ops;
39116       for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
39117         SDValue SrcOp = Op.getOperand(i);
39118         EVT SrcVT = SrcOp.getValueType();
39119         assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
39120                "Unsupported vector size");
39121         Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
39122                                                           ExtSizeInBits)
39123                                        : SrcOp);
39124       }
39125       MVT ExtVT = VT.getSimpleVT();
39126       ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
39127                                ExtSizeInBits / ExtVT.getScalarSizeInBits());
39128       SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
39129       SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39130       SDValue Insert =
39131           insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39132       return TLO.CombineTo(Op, Insert);
39133     }
39134     }
39135   }
39136 
39137   // Get target/faux shuffle mask.
39138   APInt OpUndef, OpZero;
39139   SmallVector<int, 64> OpMask;
39140   SmallVector<SDValue, 2> OpInputs;
39141   if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
39142                               OpZero, TLO.DAG, Depth, false))
39143     return false;
39144 
39145   // Shuffle inputs must be the same size as the result.
39146   if (OpMask.size() != (unsigned)NumElts ||
39147       llvm::any_of(OpInputs, [VT](SDValue V) {
39148         return VT.getSizeInBits() != V.getValueSizeInBits() ||
39149                !V.getValueType().isVector();
39150       }))
39151     return false;
39152 
39153   KnownZero = OpZero;
39154   KnownUndef = OpUndef;
39155 
39156   // Check if shuffle mask can be simplified to undef/zero/identity.
39157   int NumSrcs = OpInputs.size();
39158   for (int i = 0; i != NumElts; ++i)
39159     if (!DemandedElts[i])
39160       OpMask[i] = SM_SentinelUndef;
39161 
39162   if (isUndefInRange(OpMask, 0, NumElts)) {
39163     KnownUndef.setAllBits();
39164     return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
39165   }
39166   if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
39167     KnownZero.setAllBits();
39168     return TLO.CombineTo(
39169         Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
39170   }
39171   for (int Src = 0; Src != NumSrcs; ++Src)
39172     if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
39173       return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
39174 
39175   // Attempt to simplify inputs.
39176   for (int Src = 0; Src != NumSrcs; ++Src) {
39177     // TODO: Support inputs of different types.
39178     if (OpInputs[Src].getValueType() != VT)
39179       continue;
39180 
39181     int Lo = Src * NumElts;
39182     APInt SrcElts = APInt::getNullValue(NumElts);
39183     for (int i = 0; i != NumElts; ++i)
39184       if (DemandedElts[i]) {
39185         int M = OpMask[i] - Lo;
39186         if (0 <= M && M < NumElts)
39187           SrcElts.setBit(M);
39188       }
39189 
39190     // TODO - Propagate input undef/zero elts.
39191     APInt SrcUndef, SrcZero;
39192     if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
39193                                    TLO, Depth + 1))
39194       return true;
39195   }
39196 
39197   // If we don't demand all elements, then attempt to combine to a simpler
39198   // shuffle.
39199   // We need to convert the depth to something combineX86ShufflesRecursively
  // can handle - so pretend it's Depth == 0 again, and reduce the max depth
39201   // to match. This prevents combineX86ShuffleChain from returning a
39202   // combined shuffle that's the same as the original root, causing an
39203   // infinite loop.
39204   if (!DemandedElts.isAllOnesValue()) {
39205     assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
39206 
39207     SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
39208     for (int i = 0; i != NumElts; ++i)
39209       if (DemandedElts[i])
39210         DemandedMask[i] = i;
39211 
39212     SDValue NewShuffle = combineX86ShufflesRecursively(
39213         {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
39214         /*HasVarMask*/ false,
39215         /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
39216         Subtarget);
39217     if (NewShuffle)
39218       return TLO.CombineTo(Op, NewShuffle);
39219   }
39220 
39221   return false;
39222 }
39223 
39224 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
39225     SDValue Op, const APInt &OriginalDemandedBits,
39226     const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
39227     unsigned Depth) const {
39228   EVT VT = Op.getValueType();
39229   unsigned BitWidth = OriginalDemandedBits.getBitWidth();
39230   unsigned Opc = Op.getOpcode();
39231   switch(Opc) {
39232   case X86ISD::VTRUNC: {
39233     KnownBits KnownOp;
39234     SDValue Src = Op.getOperand(0);
39235     MVT SrcVT = Src.getSimpleValueType();
39236 
39237     // Simplify the input, using demanded bit information.
39238     APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
    APInt DemandedElts =
        OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
    if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO,
                             Depth + 1))
39241       return true;
39242     break;
39243   }
39244   case X86ISD::PMULDQ:
39245   case X86ISD::PMULUDQ: {
39246     // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
39247     KnownBits KnownOp;
39248     SDValue LHS = Op.getOperand(0);
39249     SDValue RHS = Op.getOperand(1);
39250     // FIXME: Can we bound this better?
39251     APInt DemandedMask = APInt::getLowBitsSet(64, 32);
39252     if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
39253                              TLO, Depth + 1))
39254       return true;
39255     if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
39256                              TLO, Depth + 1))
39257       return true;
39258 
39259     // Aggressively peek through ops to get at the demanded low bits.
39260     SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
39261         LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
39262     SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
39263         RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
39264     if (DemandedLHS || DemandedRHS) {
39265       DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
39266       DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
39267       return TLO.CombineTo(
39268           Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
39269     }
39270     break;
39271   }
39272   case X86ISD::VSHLI: {
39273     SDValue Op0 = Op.getOperand(0);
39274 
39275     unsigned ShAmt = Op.getConstantOperandVal(1);
39276     if (ShAmt >= BitWidth)
39277       break;
39278 
39279     APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
39280 
39281     // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
39282     // single shift.  We can do this if the bottom bits (which are shifted
39283     // out) are never demanded.
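    // For example, if the low bits shifted out are not demanded:
    //   (VSHLI (VSRLI X, 2), 5) --> (VSHLI X, 3)
    //   (VSHLI (VSRLI X, 5), 2) --> (VSRLI X, 3)
    //   (VSHLI (VSRLI X, 3), 3) --> X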
39284     if (Op0.getOpcode() == X86ISD::VSRLI &&
39285         OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
39286       unsigned Shift2Amt = Op0.getConstantOperandVal(1);
39287       if (Shift2Amt < BitWidth) {
39288         int Diff = ShAmt - Shift2Amt;
39289         if (Diff == 0)
39290           return TLO.CombineTo(Op, Op0.getOperand(0));
39291 
39292         unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
39293         SDValue NewShift = TLO.DAG.getNode(
39294             NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
39295             TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
39296         return TLO.CombineTo(Op, NewShift);
39297       }
39298     }
39299 
    // If we are only demanding sign bits then we can use the shift source
    // directly.
39301     unsigned NumSignBits =
39302         TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
39303     unsigned UpperDemandedBits =
39304         BitWidth - OriginalDemandedBits.countTrailingZeros();
39305     if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
39306       return TLO.CombineTo(Op, Op0);
39307 
39308     if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
39309                              TLO, Depth + 1))
39310       return true;
39311 
39312     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39313     Known.Zero <<= ShAmt;
39314     Known.One <<= ShAmt;
39315 
39316     // Low bits known zero.
39317     Known.Zero.setLowBits(ShAmt);
39318     return false;
39319   }
39320   case X86ISD::VSRLI: {
39321     unsigned ShAmt = Op.getConstantOperandVal(1);
39322     if (ShAmt >= BitWidth)
39323       break;
39324 
39325     APInt DemandedMask = OriginalDemandedBits << ShAmt;
39326 
39327     if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
39328                              OriginalDemandedElts, Known, TLO, Depth + 1))
39329       return true;
39330 
39331     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39332     Known.Zero.lshrInPlace(ShAmt);
39333     Known.One.lshrInPlace(ShAmt);
39334 
39335     // High bits known zero.
39336     Known.Zero.setHighBits(ShAmt);
39337     return false;
39338   }
39339   case X86ISD::VSRAI: {
39340     SDValue Op0 = Op.getOperand(0);
39341     SDValue Op1 = Op.getOperand(1);
39342 
39343     unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
39344     if (ShAmt >= BitWidth)
39345       break;
39346 
39347     APInt DemandedMask = OriginalDemandedBits << ShAmt;
39348 
39349     // If we just want the sign bit then we don't need to shift it.
39350     if (OriginalDemandedBits.isSignMask())
39351       return TLO.CombineTo(Op, Op0);
39352 
39353     // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
39354     if (Op0.getOpcode() == X86ISD::VSHLI &&
39355         Op.getOperand(1) == Op0.getOperand(1)) {
39356       SDValue Op00 = Op0.getOperand(0);
39357       unsigned NumSignBits =
39358           TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
39359       if (ShAmt < NumSignBits)
39360         return TLO.CombineTo(Op, Op00);
39361     }
39362 
39363     // If any of the demanded bits are produced by the sign extension, we also
39364     // demand the input sign bit.
39365     if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
39366       DemandedMask.setSignBit();
39367 
39368     if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
39369                              TLO, Depth + 1))
39370       return true;
39371 
39372     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39373     Known.Zero.lshrInPlace(ShAmt);
39374     Known.One.lshrInPlace(ShAmt);
39375 
39376     // If the input sign bit is known to be zero, or if none of the top bits
39377     // are demanded, turn this into an unsigned shift right.
39378     if (Known.Zero[BitWidth - ShAmt - 1] ||
39379         OriginalDemandedBits.countLeadingZeros() >= ShAmt)
39380       return TLO.CombineTo(
39381           Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
39382 
39383     // High bits are known one.
39384     if (Known.One[BitWidth - ShAmt - 1])
39385       Known.One.setHighBits(ShAmt);
39386     return false;
39387   }
39388   case X86ISD::PEXTRB:
39389   case X86ISD::PEXTRW: {
39390     SDValue Vec = Op.getOperand(0);
39391     auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
39392     MVT VecVT = Vec.getSimpleValueType();
39393     unsigned NumVecElts = VecVT.getVectorNumElements();
39394 
39395     if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
39396       unsigned Idx = CIdx->getZExtValue();
39397       unsigned VecBitWidth = VecVT.getScalarSizeInBits();
39398 
39399       // If we demand no bits from the vector then we must have demanded
      // bits from the implicit zext - simplify to zero.
39401       APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
39402       if (DemandedVecBits == 0)
39403         return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39404 
39405       APInt KnownUndef, KnownZero;
39406       APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
39407       if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
39408                                      KnownZero, TLO, Depth + 1))
39409         return true;
39410 
39411       KnownBits KnownVec;
39412       if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
39413                                KnownVec, TLO, Depth + 1))
39414         return true;
39415 
39416       if (SDValue V = SimplifyMultipleUseDemandedBits(
39417               Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
39418         return TLO.CombineTo(
39419             Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
39420 
39421       Known = KnownVec.zext(BitWidth);
39422       return false;
39423     }
39424     break;
39425   }
39426   case X86ISD::PINSRB:
39427   case X86ISD::PINSRW: {
39428     SDValue Vec = Op.getOperand(0);
39429     SDValue Scl = Op.getOperand(1);
39430     auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
39431     MVT VecVT = Vec.getSimpleValueType();
39432 
39433     if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
39434       unsigned Idx = CIdx->getZExtValue();
39435       if (!OriginalDemandedElts[Idx])
39436         return TLO.CombineTo(Op, Vec);
39437 
39438       KnownBits KnownVec;
39439       APInt DemandedVecElts(OriginalDemandedElts);
39440       DemandedVecElts.clearBit(Idx);
39441       if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
39442                                KnownVec, TLO, Depth + 1))
39443         return true;
39444 
39445       KnownBits KnownScl;
39446       unsigned NumSclBits = Scl.getScalarValueSizeInBits();
39447       APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
39448       if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
39449         return true;
39450 
39451       KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
39452       Known = KnownBits::commonBits(KnownVec, KnownScl);
39453       return false;
39454     }
39455     break;
39456   }
39457   case X86ISD::PACKSS:
39458     // PACKSS saturates to MIN/MAX integer values. So if we just want the
    // sign bit then we can just ask for the source operands' sign bits.
39460     // TODO - add known bits handling.
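    // For example, for a v16i8 PACKSS of two v8i16 sources, the sign bit of
    // each i8 result element matches the sign bit of the corresponding i16
    // source element, since signed saturation preserves the sign.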
39461     if (OriginalDemandedBits.isSignMask()) {
39462       APInt DemandedLHS, DemandedRHS;
39463       getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
39464 
39465       KnownBits KnownLHS, KnownRHS;
39466       APInt SignMask = APInt::getSignMask(BitWidth * 2);
39467       if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
39468                                KnownLHS, TLO, Depth + 1))
39469         return true;
39470       if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
39471                                KnownRHS, TLO, Depth + 1))
39472         return true;
39473 
39474       // Attempt to avoid multi-use ops if we don't need anything from them.
39475       SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
39476           Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
39477       SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
39478           Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
39479       if (DemandedOp0 || DemandedOp1) {
39480         SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
39481         SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
39482         return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
39483       }
39484     }
39485     // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
39486     break;
39487   case X86ISD::VBROADCAST: {
39488     SDValue Src = Op.getOperand(0);
39489     MVT SrcVT = Src.getSimpleValueType();
39490     APInt DemandedElts = APInt::getOneBitSet(
39491         SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
39492     if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
39493                              TLO, Depth + 1))
39494       return true;
39495     // If we don't need the upper bits, attempt to narrow the broadcast source.
39496     // Don't attempt this on AVX512 as it might affect broadcast folding.
39497     // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
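    // For example, if only the low 32 bits of each element are demanded:
    //   (v2i64 VBROADCAST (i64 X))
    //     --> (v2i64 bitcast (v4i32 VBROADCAST (i32 (trunc X))))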
39498     if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
39499         OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2)) {
39500       MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
39501       SDValue NewSrc =
39502           TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
39503       MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
39504       SDValue NewBcst =
39505           TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
39506       return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
39507     }
39508     break;
39509   }
39510   case X86ISD::PCMPGT:
39511     // icmp sgt(0, R) == ashr(R, BitWidth-1).
39512     // iff we only need the sign bit then we can use R directly.
39513     if (OriginalDemandedBits.isSignMask() &&
39514         ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
39515       return TLO.CombineTo(Op, Op.getOperand(1));
39516     break;
39517   case X86ISD::MOVMSK: {
39518     SDValue Src = Op.getOperand(0);
39519     MVT SrcVT = Src.getSimpleValueType();
39520     unsigned SrcBits = SrcVT.getScalarSizeInBits();
39521     unsigned NumElts = SrcVT.getVectorNumElements();
39522 
39523     // If we don't need the sign bits at all just return zero.
39524     if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
39525       return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39526 
39527     // Only demand the vector elements of the sign bits we need.
39528     APInt KnownUndef, KnownZero;
39529     APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
39530     if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
39531                                    TLO, Depth + 1))
39532       return true;
39533 
39534     Known.Zero = KnownZero.zextOrSelf(BitWidth);
39535     Known.Zero.setHighBits(BitWidth - NumElts);
39536 
39537     // MOVMSK only uses the MSB from each vector element.
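    // For example, for a v4i32 source only the four element sign bits feed
    // the result, and they end up in the low 4 bits of the scalar result
    // (all higher result bits are known zero).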
39538     KnownBits KnownSrc;
39539     APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
39540     if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
39541                              Depth + 1))
39542       return true;
39543 
39544     if (KnownSrc.One[SrcBits - 1])
39545       Known.One.setLowBits(NumElts);
39546     else if (KnownSrc.Zero[SrcBits - 1])
39547       Known.Zero.setLowBits(NumElts);
39548 
    // Attempt to avoid multi-use ops if we don't need anything from them.
39550     if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
39551             Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
39552       return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
39553     return false;
39554   }
39555   case X86ISD::BEXTR:
39556   case X86ISD::BEXTRI: {
39557     SDValue Op0 = Op.getOperand(0);
39558     SDValue Op1 = Op.getOperand(1);
39559 
    // Only the bottom 16 bits of the control operand are required.
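    // The control operand encodes the extraction as:
    //   bits [7:0]  - start bit index
    //   bits [15:8] - number of bits to extract
    // matching how Shift and Length are pulled out below.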
39561     if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
39562       // NOTE: SimplifyDemandedBits won't do this for constants.
39563       uint64_t Val1 = Cst1->getZExtValue();
39564       uint64_t MaskedVal1 = Val1 & 0xFFFF;
39565       if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
39566         SDLoc DL(Op);
39567         return TLO.CombineTo(
39568             Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
39569                                 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
39570       }
39571 
39572       unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
39573       unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
39574 
39575       // If the length is 0, the result is 0.
39576       if (Length == 0) {
39577         Known.setAllZero();
39578         return false;
39579       }
39580 
39581       if ((Shift + Length) <= BitWidth) {
39582         APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
39583         if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
39584           return true;
39585 
39586         Known = Known.extractBits(Length, Shift);
39587         Known = Known.zextOrTrunc(BitWidth);
39588         return false;
39589       }
39590     } else {
39591       assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
39592       KnownBits Known1;
39593       APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
39594       if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
39595         return true;
39596 
39597       // If the length is 0, replace with 0.
39598       KnownBits LengthBits = Known1.extractBits(8, 8);
39599       if (LengthBits.isZero())
39600         return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39601     }
39602 
39603     break;
39604   }
39605   case X86ISD::PDEP: {
39606     SDValue Op0 = Op.getOperand(0);
39607     SDValue Op1 = Op.getOperand(1);
39608 
39609     unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
39610     APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
39611 
    // If the demanded bits have leading zeroes, we don't demand those from the
39613     // mask.
39614     if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
39615       return true;
39616 
39617     // The number of possible 1s in the mask determines the number of LSBs of
39618     // operand 0 used. Undemanded bits from the mask don't matter so filter
39619     // them before counting.
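    // For example, with a demanded-filtered mask of 0b00101100 (three set
    // bits), only the low 3 bits of operand 0 can be deposited, so only those
    // bits are demanded from it.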
39620     KnownBits Known2;
39621     uint64_t Count = (~Known.Zero & LoMask).countPopulation();
39622     APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
39623     if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
39624       return true;
39625 
39626     // Zeroes are retained from the mask, but not ones.
39627     Known.One.clearAllBits();
39628     // The result will have at least as many trailing zeros as the non-mask
39629     // operand since bits can only map to the same or higher bit position.
39630     Known.Zero.setLowBits(Known2.countMinTrailingZeros());
39631     return false;
39632   }
39633   }
39634 
39635   return TargetLowering::SimplifyDemandedBitsForTargetNode(
39636       Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
39637 }
39638 
39639 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
39640     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
39641     SelectionDAG &DAG, unsigned Depth) const {
39642   int NumElts = DemandedElts.getBitWidth();
39643   unsigned Opc = Op.getOpcode();
39644   EVT VT = Op.getValueType();
39645 
39646   switch (Opc) {
39647   case X86ISD::PINSRB:
39648   case X86ISD::PINSRW: {
39649     // If we don't demand the inserted element, return the base vector.
39650     SDValue Vec = Op.getOperand(0);
39651     auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
39652     MVT VecVT = Vec.getSimpleValueType();
39653     if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
39654         !DemandedElts[CIdx->getZExtValue()])
39655       return Vec;
39656     break;
39657   }
39658   case X86ISD::VSHLI: {
39659     // If we are only demanding sign bits then we can use the shift source
39660     // directly.
39661     SDValue Op0 = Op.getOperand(0);
39662     unsigned ShAmt = Op.getConstantOperandVal(1);
39663     unsigned BitWidth = DemandedBits.getBitWidth();
39664     unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
39665     unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
39666     if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
39667       return Op0;
39668     break;
39669   }
39670   case X86ISD::VSRAI:
39671     // iff we only need the sign bit then we can use the source directly.
39672     // TODO: generalize where we only demand extended signbits.
39673     if (DemandedBits.isSignMask())
39674       return Op.getOperand(0);
39675     break;
39676   case X86ISD::PCMPGT:
39677     // icmp sgt(0, R) == ashr(R, BitWidth-1).
39678     // iff we only need the sign bit then we can use R directly.
39679     if (DemandedBits.isSignMask() &&
39680         ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
39681       return Op.getOperand(1);
39682     break;
39683   }
39684 
39685   APInt ShuffleUndef, ShuffleZero;
39686   SmallVector<int, 16> ShuffleMask;
39687   SmallVector<SDValue, 2> ShuffleOps;
39688   if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
39689                              ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
39690     // If all the demanded elts are from one operand and are inline,
39691     // then we can use the operand directly.
39692     int NumOps = ShuffleOps.size();
39693     if (ShuffleMask.size() == (unsigned)NumElts &&
39694         llvm::all_of(ShuffleOps, [VT](SDValue V) {
39695           return VT.getSizeInBits() == V.getValueSizeInBits();
39696         })) {
39697 
39698       if (DemandedElts.isSubsetOf(ShuffleUndef))
39699         return DAG.getUNDEF(VT);
39700       if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
39701         return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
39702 
39703       // Bitmask that indicates which ops have only been accessed 'inline'.
39704       APInt IdentityOp = APInt::getAllOnesValue(NumOps);
39705       for (int i = 0; i != NumElts; ++i) {
39706         int M = ShuffleMask[i];
39707         if (!DemandedElts[i] || ShuffleUndef[i])
39708           continue;
39709         int OpIdx = M / NumElts;
39710         int EltIdx = M % NumElts;
39711         if (M < 0 || EltIdx != i) {
39712           IdentityOp.clearAllBits();
39713           break;
39714         }
39715         IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
39716         if (IdentityOp == 0)
39717           break;
39718       }
39719       assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
39720              "Multiple identity shuffles detected");
39721 
39722       if (IdentityOp != 0)
39723         return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
39724     }
39725   }
39726 
39727   return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
39728       Op, DemandedBits, DemandedElts, DAG, Depth);
39729 }
39730 
39731 // Helper to peek through bitops/trunc/setcc to determine size of source vector.
39732 // Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
39733 static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
39734                                       bool AllowTruncate) {
39735   switch (Src.getOpcode()) {
39736   case ISD::TRUNCATE:
39737     if (!AllowTruncate)
39738       return false;
39739     LLVM_FALLTHROUGH;
39740   case ISD::SETCC:
39741     return Src.getOperand(0).getValueSizeInBits() == Size;
39742   case ISD::AND:
39743   case ISD::XOR:
39744   case ISD::OR:
39745     return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
39746            checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
39747   }
39748   return false;
39749 }
39750 
39751 // Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
39752 static unsigned getAltBitOpcode(unsigned Opcode) {
39753   switch(Opcode) {
39754   case ISD::AND: return X86ISD::FAND;
39755   case ISD::OR: return X86ISD::FOR;
39756   case ISD::XOR: return X86ISD::FXOR;
39757   case X86ISD::ANDNP: return X86ISD::FANDN;
39758   }
39759   llvm_unreachable("Unknown bitwise opcode");
39760 }
39761 
39762 // Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
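// With only SSE1 available, 128-bit integer compares and PMOVMSKB aren't
// usable, so the v4i1 source is rebuilt in terms of v4f32 values (and FP
// logic ops) to let the caller emit MOVMSKPS instead.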
39763 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
39764                                           const SDLoc &DL) {
39765   EVT SrcVT = Src.getValueType();
39766   if (SrcVT != MVT::v4i1)
39767     return SDValue();
39768 
39769   switch (Src.getOpcode()) {
39770   case ISD::SETCC:
39771     if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
39772         ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
39773         cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
39774       SDValue Op0 = Src.getOperand(0);
39775       if (ISD::isNormalLoad(Op0.getNode()))
39776         return DAG.getBitcast(MVT::v4f32, Op0);
39777       if (Op0.getOpcode() == ISD::BITCAST &&
39778           Op0.getOperand(0).getValueType() == MVT::v4f32)
39779         return Op0.getOperand(0);
39780     }
39781     break;
39782   case ISD::AND:
39783   case ISD::XOR:
39784   case ISD::OR: {
39785     SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
39786     SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
39787     if (Op0 && Op1)
39788       return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
39789                          Op1);
39790     break;
39791   }
39792   }
39793   return SDValue();
39794 }
39795 
39796 // Helper to push sign extension of vXi1 SETCC result through bitops.
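// For example:
//   sext(or(setcc(A), setcc(B))) --> or(sext(setcc(A)), sext(setcc(B)))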
39797 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
39798                                           SDValue Src, const SDLoc &DL) {
39799   switch (Src.getOpcode()) {
39800   case ISD::SETCC:
39801   case ISD::TRUNCATE:
39802     return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
39803   case ISD::AND:
39804   case ISD::XOR:
39805   case ISD::OR:
39806     return DAG.getNode(
39807         Src.getOpcode(), DL, SExtVT,
39808         signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
39809         signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
39810   }
39811   llvm_unreachable("Unexpected node type for vXi1 sign extension");
39812 }
39813 
39814 // Try to match patterns such as
39815 // (i16 bitcast (v16i1 x))
39816 // ->
// (i16 movmsk (v16i8 sext (v16i1 x)))
39818 // before the illegal vector is scalarized on subtargets that don't have legal
39819 // vxi1 types.
39820 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
39821                                   const SDLoc &DL,
39822                                   const X86Subtarget &Subtarget) {
39823   EVT SrcVT = Src.getValueType();
39824   if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
39825     return SDValue();
39826 
39827   // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
39828   // legalization destroys the v4i32 type.
39829   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
39830     if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
39831       V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
39832                       DAG.getBitcast(MVT::v4f32, V));
39833       return DAG.getZExtOrTrunc(V, DL, VT);
39834     }
39835   }
39836 
39837   // If the input is a truncate from v16i8 or v32i8 go ahead and use a
39838   // movmskb even with avx512. This will be better than truncating to vXi1 and
39839   // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
39840   // vpcmpeqb/vpcmpgtb.
39841   bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
39842                       (Src.getOperand(0).getValueType() == MVT::v16i8 ||
39843                        Src.getOperand(0).getValueType() == MVT::v32i8 ||
39844                        Src.getOperand(0).getValueType() == MVT::v64i8);
39845 
39846   // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
39847   // directly with vpmovmskb/vmovmskps/vmovmskpd.
39848   if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
39849       cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
39850       ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
39851     EVT CmpVT = Src.getOperand(0).getValueType();
39852     EVT EltVT = CmpVT.getVectorElementType();
39853     if (CmpVT.getSizeInBits() <= 256 &&
39854         (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
39855       PreferMovMsk = true;
39856   }
39857 
39858   // With AVX512 vxi1 types are legal and we prefer using k-regs.
39859   // MOVMSK is supported in SSE2 or later.
39860   if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
39861     return SDValue();
39862 
  // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v2f64 and
  // v4f64. So all legal 128-bit and 256-bit vectors are covered except for
39865   // v8i16 and v16i16.
39866   // For these two cases, we can shuffle the upper element bytes to a
39867   // consecutive sequence at the start of the vector and treat the results as
39868   // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
39869   // for v16i16 this is not the case, because the shuffle is expensive, so we
39870   // avoid sign-extending to this type entirely.
39871   // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
39872   // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
39873   MVT SExtVT;
39874   bool PropagateSExt = false;
39875   switch (SrcVT.getSimpleVT().SimpleTy) {
39876   default:
39877     return SDValue();
39878   case MVT::v2i1:
39879     SExtVT = MVT::v2i64;
39880     break;
39881   case MVT::v4i1:
39882     SExtVT = MVT::v4i32;
39883     // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
39884     // sign-extend to a 256-bit operation to avoid truncation.
39885     if (Subtarget.hasAVX() &&
39886         checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
39887       SExtVT = MVT::v4i64;
39888       PropagateSExt = true;
39889     }
39890     break;
39891   case MVT::v8i1:
39892     SExtVT = MVT::v8i16;
39893     // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
39894     // sign-extend to a 256-bit operation to match the compare.
39895     // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
39896     // 256-bit because the shuffle is cheaper than sign extending the result of
39897     // the compare.
39898     if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
39899                                checkBitcastSrcVectorSize(Src, 512, true))) {
39900       SExtVT = MVT::v8i32;
39901       PropagateSExt = true;
39902     }
39903     break;
39904   case MVT::v16i1:
39905     SExtVT = MVT::v16i8;
39906     // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
39907     // it is not profitable to sign-extend to 256-bit because this will
39908     // require an extra cross-lane shuffle which is more expensive than
39909     // truncating the result of the compare to 128-bits.
39910     break;
39911   case MVT::v32i1:
39912     SExtVT = MVT::v32i8;
39913     break;
39914   case MVT::v64i1:
    // If we have AVX512F but not AVX512BW, and the input is a truncation from
    // v64i8 (checked earlier), then split the input and make two pmovmskbs.
39917     if (Subtarget.hasAVX512()) {
39918       if (Subtarget.hasBWI())
39919         return SDValue();
39920       SExtVT = MVT::v64i8;
39921       break;
39922     }
39923     // Split if this is a <64 x i8> comparison result.
39924     if (checkBitcastSrcVectorSize(Src, 512, false)) {
39925       SExtVT = MVT::v64i8;
39926       break;
39927     }
39928     return SDValue();
39929   };
39930 
39931   SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
39932                             : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
39933 
39934   if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
39935     V = getPMOVMSKB(DL, V, DAG, Subtarget);
39936   } else {
39937     if (SExtVT == MVT::v8i16)
39938       V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
39939                       DAG.getUNDEF(MVT::v8i16));
39940     V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
39941   }
39942 
39943   EVT IntVT =
39944       EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
39945   V = DAG.getZExtOrTrunc(V, DL, IntVT);
39946   return DAG.getBitcast(VT, V);
39947 }
39948 
39949 // Convert a vXi1 constant build vector to the same width scalar integer.
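// For example, (v4i1 <1,0,1,1>) becomes (i4 0b1101), with element 0 mapping
// to bit 0 of the result.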
39950 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
39951   EVT SrcVT = Op.getValueType();
39952   assert(SrcVT.getVectorElementType() == MVT::i1 &&
39953          "Expected a vXi1 vector");
39954   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
39955          "Expected a constant build vector");
39956 
39957   APInt Imm(SrcVT.getVectorNumElements(), 0);
39958   for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
39959     SDValue In = Op.getOperand(Idx);
39960     if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
39961       Imm.setBit(Idx);
39962   }
39963   EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
39964   return DAG.getConstant(Imm, SDLoc(Op), IntVT);
39965 }
39966 
39967 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
39968                                            TargetLowering::DAGCombinerInfo &DCI,
39969                                            const X86Subtarget &Subtarget) {
39970   assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
39971 
39972   if (!DCI.isBeforeLegalizeOps())
39973     return SDValue();
39974 
39975   // Only do this if we have k-registers.
39976   if (!Subtarget.hasAVX512())
39977     return SDValue();
39978 
39979   EVT DstVT = N->getValueType(0);
39980   SDValue Op = N->getOperand(0);
39981   EVT SrcVT = Op.getValueType();
39982 
39983   if (!Op.hasOneUse())
39984     return SDValue();
39985 
39986   // Look for logic ops.
39987   if (Op.getOpcode() != ISD::AND &&
39988       Op.getOpcode() != ISD::OR &&
39989       Op.getOpcode() != ISD::XOR)
39990     return SDValue();
39991 
39992   // Make sure we have a bitcast between mask registers and a scalar type.
39993   if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
39994         DstVT.isScalarInteger()) &&
39995       !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
39996         SrcVT.isScalarInteger()))
39997     return SDValue();
39998 
39999   SDValue LHS = Op.getOperand(0);
40000   SDValue RHS = Op.getOperand(1);
40001 
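  // bitcast(op(bitcast(X), Y)) --> op(X, bitcast(Y)) if X already has the
  // destination type, and similarly for the RHS.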
40002   if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
40003       LHS.getOperand(0).getValueType() == DstVT)
40004     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
40005                        DAG.getBitcast(DstVT, RHS));
40006 
40007   if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
40008       RHS.getOperand(0).getValueType() == DstVT)
40009     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
40010                        DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
40011 
40012   // If the RHS is a vXi1 build vector, this is a good reason to flip too.
40013   // Most of these have to move a constant from the scalar domain anyway.
40014   if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
40015     RHS = combinevXi1ConstantToInteger(RHS, DAG);
40016     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
40017                        DAG.getBitcast(DstVT, LHS), RHS);
40018   }
40019 
40020   return SDValue();
40021 }
40022 
40023 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
40024                                     const X86Subtarget &Subtarget) {
40025   SDLoc DL(BV);
40026   unsigned NumElts = BV->getNumOperands();
40027   SDValue Splat = BV->getSplatValue();
40028 
40029   // Build MMX element from integer GPR or SSE float values.
40030   auto CreateMMXElement = [&](SDValue V) {
40031     if (V.isUndef())
40032       return DAG.getUNDEF(MVT::x86mmx);
40033     if (V.getValueType().isFloatingPoint()) {
40034       if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
40035         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
40036         V = DAG.getBitcast(MVT::v2i64, V);
40037         return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
40038       }
40039       V = DAG.getBitcast(MVT::i32, V);
40040     } else {
40041       V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
40042     }
40043     return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
40044   };
40045 
40046   // Convert build vector ops to MMX data in the bottom elements.
40047   SmallVector<SDValue, 8> Ops;
40048 
40049   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40050 
40051   // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
40052   if (Splat) {
40053     if (Splat.isUndef())
40054       return DAG.getUNDEF(MVT::x86mmx);
40055 
40056     Splat = CreateMMXElement(Splat);
40057 
40058     if (Subtarget.hasSSE1()) {
40059       // Unpack v8i8 to splat i8 elements to lowest 16-bits.
40060       if (NumElts == 8)
40061         Splat = DAG.getNode(
40062             ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
40063             DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
40064                                   TLI.getPointerTy(DAG.getDataLayout())),
40065             Splat, Splat);
40066 
40067       // Use PSHUFW to repeat 16-bit elements.
40068       unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
40069       return DAG.getNode(
40070           ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
40071           DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
40072                                 TLI.getPointerTy(DAG.getDataLayout())),
40073           Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
40074     }
40075     Ops.append(NumElts, Splat);
40076   } else {
40077     for (unsigned i = 0; i != NumElts; ++i)
40078       Ops.push_back(CreateMMXElement(BV->getOperand(i)));
40079   }
40080 
40081   // Use tree of PUNPCKLs to build up general MMX vector.
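  // For example, for 8 elements: PUNPCKLBW pairs the 8 byte values into 4,
  // PUNPCKLWD pairs those into 2, and a final PUNPCKLDQ produces the result.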
40082   while (Ops.size() > 1) {
40083     unsigned NumOps = Ops.size();
40084     unsigned IntrinOp =
40085         (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
40086                      : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
40087                                     : Intrinsic::x86_mmx_punpcklbw));
40088     SDValue Intrin = DAG.getTargetConstant(
40089         IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
40090     for (unsigned i = 0; i != NumOps; i += 2)
40091       Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
40092                                Ops[i], Ops[i + 1]);
40093     Ops.resize(NumOps / 2);
40094   }
40095 
40096   return Ops[0];
40097 }
40098 
40099 // Recursive function that attempts to find if a bool vector node was originally
40100 // a vector/float/double that got truncated/extended/bitcast to/from a scalar
40101 // integer. If so, replace the scalar ops with bool vector equivalents back down
40102 // the chain.
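// For example, with AVX512 (so v16i1 is legal), the bitcast combine below can
// turn:
//   (v8i1 bitcast (i8 trunc (i16 bitcast (v16i1 X))))
//     --> (v8i1 extract_subvector (v16i1 X), 0)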
40103 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
40104                                           SelectionDAG &DAG,
40105                                           const X86Subtarget &Subtarget) {
40106   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40107   unsigned Opc = V.getOpcode();
40108   switch (Opc) {
40109   case ISD::BITCAST: {
40110     // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
40111     SDValue Src = V.getOperand(0);
40112     EVT SrcVT = Src.getValueType();
40113     if (SrcVT.isVector() || SrcVT.isFloatingPoint())
40114       return DAG.getBitcast(VT, Src);
40115     break;
40116   }
40117   case ISD::TRUNCATE: {
40118     // If we find a suitable source, a truncated scalar becomes a subvector.
40119     SDValue Src = V.getOperand(0);
40120     EVT NewSrcVT =
40121         EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
40122     if (TLI.isTypeLegal(NewSrcVT))
40123       if (SDValue N0 =
40124               combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
40125         return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
40126                            DAG.getIntPtrConstant(0, DL));
40127     break;
40128   }
40129   case ISD::ANY_EXTEND:
40130   case ISD::ZERO_EXTEND: {
40131     // If we find a suitable source, an extended scalar becomes a subvector.
40132     SDValue Src = V.getOperand(0);
40133     EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
40134                                     Src.getScalarValueSizeInBits());
40135     if (TLI.isTypeLegal(NewSrcVT))
40136       if (SDValue N0 =
40137               combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
40138         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
40139                            Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
40140                                                   : DAG.getConstant(0, DL, VT),
40141                            N0, DAG.getIntPtrConstant(0, DL));
40142     break;
40143   }
40144   case ISD::OR: {
40145     // If we find suitable sources, we can just move an OR to the vector domain.
40146     SDValue Src0 = V.getOperand(0);
40147     SDValue Src1 = V.getOperand(1);
40148     if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
40149       if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
40150         return DAG.getNode(Opc, DL, VT, N0, N1);
40151     break;
40152   }
40153   case ISD::SHL: {
40154     // If we find a suitable source, a SHL becomes a KSHIFTL.
40155     SDValue Src0 = V.getOperand(0);
40156     if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
40157         ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
40158       break;
40159 
40160     if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
40161       if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
40162         return DAG.getNode(
40163             X86ISD::KSHIFTL, DL, VT, N0,
40164             DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
40165     break;
40166   }
40167   }
40168   return SDValue();
40169 }
40170 
40171 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
40172                               TargetLowering::DAGCombinerInfo &DCI,
40173                               const X86Subtarget &Subtarget) {
40174   SDValue N0 = N->getOperand(0);
40175   EVT VT = N->getValueType(0);
40176   EVT SrcVT = N0.getValueType();
40177   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40178 
40179   // Try to match patterns such as
40180   // (i16 bitcast (v16i1 x))
40181   // ->
  // (i16 movmsk (v16i8 sext (v16i1 x)))
40183   // before the setcc result is scalarized on subtargets that don't have legal
40184   // vxi1 types.
40185   if (DCI.isBeforeLegalize()) {
40186     SDLoc dl(N);
40187     if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
40188       return V;
40189 
40190     // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
40191     // type, widen both sides to avoid a trip through memory.
40192     if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
40193         Subtarget.hasAVX512()) {
40194       N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
40195       N0 = DAG.getBitcast(MVT::v8i1, N0);
40196       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
40197                          DAG.getIntPtrConstant(0, dl));
40198     }
40199 
40200     // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
40201     // type, widen both sides to avoid a trip through memory.
40202     if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
40203         Subtarget.hasAVX512()) {
40204       // Use zeros for the widening if we already have some zeroes. This can
40205       // allow SimplifyDemandedBits to remove scalar ANDs that may be down
40206       // stream of this.
40207       // FIXME: It might make sense to detect a concat_vectors with a mix of
40208       // zeroes and undef and turn it into insert_subvector for i1 vectors as
40209       // a separate combine. What we can't do is canonicalize the operands of
40210       // such a concat or we'll get into a loop with SimplifyDemandedBits.
40211       if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
40212         SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
40213         if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
40214           SrcVT = LastOp.getValueType();
40215           unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
40216           SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
40217           Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
40218           N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
40219           N0 = DAG.getBitcast(MVT::i8, N0);
40220           return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
40221         }
40222       }
40223 
40224       unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
40225       SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
40226       Ops[0] = N0;
40227       N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
40228       N0 = DAG.getBitcast(MVT::i8, N0);
40229       return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
40230     }
40231   } else {
40232     // If we're bitcasting from iX to vXi1, see if the integer originally
40233     // began as a vXi1 and whether we can remove the bitcast entirely.
40234     if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
40235         SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
40236       if (SDValue V =
40237               combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
40238         return V;
40239     }
40240   }
40241 
40242   // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
40243   // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
40244   // due to insert_subvector legalization on KNL. By promoting the copy to i16
40245   // we can help with known bits propagation from the vXi1 domain to the
40246   // scalar domain.
40247   if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
40248       !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40249       N0.getOperand(0).getValueType() == MVT::v16i1 &&
40250       isNullConstant(N0.getOperand(1)))
40251     return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
40252                        DAG.getBitcast(MVT::i16, N0.getOperand(0)));
40253 
40254   // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
40255   // and the vbroadcast_load are both integer or both fp. In some cases this
40256   // will remove the bitcast entirely.
40257   if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
40258        VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
40259     auto *BCast = cast<MemIntrinsicSDNode>(N0);
40260     unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
40261     unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
    // Don't swap i8/i16 since we don't have fp types of that size.
40263     if (MemSize >= 32) {
40264       MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
40265                                        : MVT::getIntegerVT(MemSize);
40266       MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
40267                                         : MVT::getIntegerVT(SrcVTSize);
40268       LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
40269 
40270       SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
40271       SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
40272       SDValue ResNode =
40273           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
40274                                   MemVT, BCast->getMemOperand());
40275       DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
40276       return DAG.getBitcast(VT, ResNode);
40277     }
40278   }
40279 
40280   // Since MMX types are special and don't usually play with other vector types,
40281   // it's better to handle them early to be sure we emit efficient code by
40282   // avoiding store-load conversions.
40283   if (VT == MVT::x86mmx) {
40284     // Detect MMX constant vectors.
40285     APInt UndefElts;
40286     SmallVector<APInt, 1> EltBits;
40287     if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
40288       SDLoc DL(N0);
40289       // Handle zero-extension of i32 with MOVD.
40290       if (EltBits[0].countLeadingZeros() >= 32)
40291         return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
40292                            DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
40293       // Else, bitcast to a double.
40294       // TODO - investigate supporting sext 32-bit immediates on x86_64.
40295       APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
40296       return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
40297     }
40298 
40299     // Detect bitcasts to x86mmx low word.
40300     if (N0.getOpcode() == ISD::BUILD_VECTOR &&
40301         (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
40302         N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
40303       bool LowUndef = true, AllUndefOrZero = true;
40304       for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
40305         SDValue Op = N0.getOperand(i);
40306         LowUndef &= Op.isUndef() || (i >= e/2);
40307         AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
40308       }
40309       if (AllUndefOrZero) {
40310         SDValue N00 = N0.getOperand(0);
40311         SDLoc dl(N00);
40312         N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
40313                        : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
40314         return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
40315       }
40316     }
40317 
40318     // Detect bitcasts of 64-bit build vectors and convert to a
40319     // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
40320     // lowest element.
40321     if (N0.getOpcode() == ISD::BUILD_VECTOR &&
40322         (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
40323          SrcVT == MVT::v8i8))
40324       return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
40325 
40326     // Detect bitcasts between element or subvector extraction to x86mmx.
40327     if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
40328          N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
40329         isNullConstant(N0.getOperand(1))) {
40330       SDValue N00 = N0.getOperand(0);
40331       if (N00.getValueType().is128BitVector())
40332         return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
40333                            DAG.getBitcast(MVT::v2i64, N00));
40334     }
40335 
40336     // Detect bitcasts from FP_TO_SINT to x86mmx.
40337     if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
40338       SDLoc DL(N0);
40339       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
40340                                 DAG.getUNDEF(MVT::v2i32));
40341       return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
40342                          DAG.getBitcast(MVT::v2i64, Res));
40343     }
40344   }
40345 
40346   // Try to remove a bitcast of constant vXi1 vector. We have to legalize
40347   // most of these to scalar anyway.
40348   if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
40349       SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
40350       ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
40351     return combinevXi1ConstantToInteger(N0, DAG);
40352   }
40353 
40354   if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
40355       VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
40356       isa<ConstantSDNode>(N0)) {
40357     auto *C = cast<ConstantSDNode>(N0);
40358     if (C->isAllOnesValue())
40359       return DAG.getConstant(1, SDLoc(N0), VT);
40360     if (C->isNullValue())
40361       return DAG.getConstant(0, SDLoc(N0), VT);
40362   }
40363 
40364   // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
40365   // Turn it into a sign bit compare that produces a k-register. This avoids
40366   // a trip through a GPR.
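  // For example:
  //   (v4i1 bitcast (i4 trunc (MOVMSK (v4f32 X))))
  //     --> (v4i1 setcc (v4i32 bitcast X), zero vector, setlt)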
40367   if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
40368       VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
40369       isPowerOf2_32(VT.getVectorNumElements())) {
40370     unsigned NumElts = VT.getVectorNumElements();
40371     SDValue Src = N0;
40372 
40373     // Peek through truncate.
40374     if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
40375       Src = N0.getOperand(0);
40376 
40377     if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
40378       SDValue MovmskIn = Src.getOperand(0);
40379       MVT MovmskVT = MovmskIn.getSimpleValueType();
40380       unsigned MovMskElts = MovmskVT.getVectorNumElements();
40381 
40382       // We allow extra bits of the movmsk to be used since they are known zero.
40383       // We can't convert a VPMOVMSKB without avx512bw.
40384       if (MovMskElts <= NumElts &&
40385           (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
40386         EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
40387         MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
40388         SDLoc dl(N);
40389         MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
40390         SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
40391                                    DAG.getConstant(0, dl, IntVT), ISD::SETLT);
40392         if (EVT(CmpVT) == VT)
40393           return Cmp;
40394 
40395         // Pad with zeroes up to original VT to replace the zeroes that were
40396         // being used from the MOVMSK.
40397         unsigned NumConcats = NumElts / MovMskElts;
40398         SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
40399         Ops[0] = Cmp;
40400         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
40401       }
40402     }
40403   }
40404 
40405   // Try to remove bitcasts from input and output of mask arithmetic to
40406   // remove GPR<->K-register crossings.
40407   if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
40408     return V;
40409 
40410   // Convert a bitcasted integer logic operation that has one bitcasted
40411   // floating-point operand into a floating-point logic operation. This may
40412   // create a load of a constant, but that is cheaper than materializing the
40413   // constant in an integer register and transferring it to an SSE register or
40414   // transferring the SSE operand to integer register and back.
40415   unsigned FPOpcode;
40416   switch (N0.getOpcode()) {
40417     case ISD::AND: FPOpcode = X86ISD::FAND; break;
40418     case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
40419     case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
40420     default: return SDValue();
40421   }
40422 
40423   // Check if we have a bitcast from another integer type as well.
40424   if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
40425         (Subtarget.hasSSE2() && VT == MVT::f64) ||
40426         (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
40427          TLI.isTypeLegal(VT))))
40428     return SDValue();
40429 
40430   SDValue LogicOp0 = N0.getOperand(0);
40431   SDValue LogicOp1 = N0.getOperand(1);
40432   SDLoc DL0(N0);
40433 
40434   // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
40435   if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
40436       LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
40437       LogicOp0.getOperand(0).getValueType() == VT &&
40438       !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
40439     SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
40440     unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
40441     return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
40442   }
40443   // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
40444   if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
40445       LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
40446       LogicOp1.getOperand(0).getValueType() == VT &&
40447       !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
40448     SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
40449     unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
40450     return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
40451   }
40452 
40453   return SDValue();
40454 }
40455 
// Given an ABS node, detect the following pattern:
40457 // (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
40458 // This is useful as it is the input into a SAD pattern.
40459 static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
40460   SDValue AbsOp1 = Abs->getOperand(0);
40461   if (AbsOp1.getOpcode() != ISD::SUB)
40462     return false;
40463 
40464   Op0 = AbsOp1.getOperand(0);
40465   Op1 = AbsOp1.getOperand(1);
40466 
40467   // Check if the operands of the sub are zero-extended from vectors of i8.
40468   if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
40469       Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
40470       Op1.getOpcode() != ISD::ZERO_EXTEND ||
40471       Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
40472     return false;
40473 
40474   return true;
40475 }
40476 
40477 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
40478 // to these zexts.
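// PSADBW computes, per 64-bit lane, the sum of absolute differences of the
// eight unsigned byte pairs, leaving the 16-bit sum zero-extended in that
// lane.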
40479 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
40480                             const SDValue &Zext1, const SDLoc &DL,
40481                             const X86Subtarget &Subtarget) {
40482   // Find the appropriate width for the PSADBW.
40483   EVT InVT = Zext0.getOperand(0).getValueType();
40484   unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
40485 
40486   // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
40487   // fill in the missing vector elements with 0.
40488   unsigned NumConcat = RegSize / InVT.getSizeInBits();
40489   SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
40490   Ops[0] = Zext0.getOperand(0);
40491   MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
40492   SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
40493   Ops[0] = Zext1.getOperand(0);
40494   SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
40495 
40496   // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
40497   auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
40498                           ArrayRef<SDValue> Ops) {
40499     MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
40500     return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
40501   };
40502   MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
40503   return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
40504                           PSADBWBuilder);
40505 }
40506 
// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
40508 // PHMINPOSUW.
40509 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
40510                                       const X86Subtarget &Subtarget) {
40511   // Bail without SSE41.
40512   if (!Subtarget.hasSSE41())
40513     return SDValue();
40514 
40515   EVT ExtractVT = Extract->getValueType(0);
40516   if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
40517     return SDValue();
40518 
40519   // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
40520   ISD::NodeType BinOp;
40521   SDValue Src = DAG.matchBinOpReduction(
40522       Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
40523   if (!Src)
40524     return SDValue();
40525 
40526   EVT SrcVT = Src.getValueType();
40527   EVT SrcSVT = SrcVT.getScalarType();
40528   if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
40529     return SDValue();
40530 
40531   SDLoc DL(Extract);
40532   SDValue MinPos = Src;
40533 
40534   // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
40535   while (SrcVT.getSizeInBits() > 128) {
40536     SDValue Lo, Hi;
40537     std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
40538     SrcVT = Lo.getValueType();
40539     MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
40540   }
40541   assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
40542           (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
40543          "Unexpected value type");
40544 
  // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must first apply
  // a mask to flip the values accordingly.
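  // Each mask (SignedMax for SMAX, SignedMin for SMIN, AllOnes for UMAX) turns
  // the reduction into an unsigned-min over the XOR-ed values, and since XOR
  // is an involution the second XOR below recovers the real result.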
40547   SDValue Mask;
40548   unsigned MaskEltsBits = ExtractVT.getSizeInBits();
40549   if (BinOp == ISD::SMAX)
40550     Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
40551   else if (BinOp == ISD::SMIN)
40552     Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
40553   else if (BinOp == ISD::UMAX)
40554     Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
40555 
40556   if (Mask)
40557     MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
40558 
  // For v16i8 cases we need to perform UMIN on pairs of byte elements,
  // shuffling each upper element down and inserting zeros. This means the
  // v16i8 UMIN will leave the upper byte of each pair as zero, performing the
  // zero-extension ready for the PHMINPOS.
40563   if (ExtractVT == MVT::i8) {
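    // Indices >= 16 select from the all-zeros second operand, so Upper holds
    // the odd bytes of MinPos in the even positions and zeros in the odd
    // positions; the UMIN thus leaves a zero-extended byte-pair minimum in
    // each i16 lane.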
40564     SDValue Upper = DAG.getVectorShuffle(
40565         SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
40566         {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
40567     MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
40568   }
40569 
  // Perform the PHMINPOS on a v8i16 vector.
40571   MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
40572   MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
40573   MinPos = DAG.getBitcast(SrcVT, MinPos);
40574 
40575   if (Mask)
40576     MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
40577 
40578   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
40579                      DAG.getIntPtrConstant(0, DL));
40580 }
40581 
// Attempt to replace an all_of/any_of/parity style horizontal reduction with a
// MOVMSK.
40583 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
40584                                          const X86Subtarget &Subtarget) {
40585   // Bail without SSE2.
40586   if (!Subtarget.hasSSE2())
40587     return SDValue();
40588 
40589   EVT ExtractVT = Extract->getValueType(0);
40590   unsigned BitWidth = ExtractVT.getSizeInBits();
40591   if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
40592       ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
40593     return SDValue();
40594 
40595   // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
40596   ISD::NodeType BinOp;
40597   SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
40598   if (!Match && ExtractVT == MVT::i1)
40599     Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
40600   if (!Match)
40601     return SDValue();
40602 
40603   // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
40604   // which we can't support here for now.
40605   if (Match.getScalarValueSizeInBits() != BitWidth)
40606     return SDValue();
40607 
40608   SDValue Movmsk;
40609   SDLoc DL(Extract);
40610   EVT MatchVT = Match.getValueType();
40611   unsigned NumElts = MatchVT.getVectorNumElements();
40612   unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
40613   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40614 
40615   if (ExtractVT == MVT::i1) {
40616     // Special case for (pre-legalization) vXi1 reductions.
40617     if (NumElts > 64 || !isPowerOf2_32(NumElts))
40618       return SDValue();
40619     if (TLI.isTypeLegal(MatchVT)) {
40620       // If this is a legal AVX512 predicate type then we can just bitcast.
40621       EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
40622       Movmsk = DAG.getBitcast(MovmskVT, Match);
40623     } else {
40624       // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
40625       // PCMPEQQ (SSE41+), use PCMPEQD instead.
40626       if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
40627           Match.getOpcode() == ISD::SETCC &&
40628           ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
40629           cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
40630               ISD::CondCode::SETEQ) {
40631         SDValue Vec = Match.getOperand(0);
40632         if (Vec.getValueType().getScalarType() == MVT::i64 &&
40633             (2 * NumElts) <= MaxElts) {
40634           NumElts *= 2;
40635           EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
40636           MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
40637           Match = DAG.getSetCC(
40638               DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
40639               DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
40640         }
40641       }
40642 
40643       // Use combineBitcastvxi1 to create the MOVMSK.
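      // First reduce the vector (combining lo/hi halves with BinOp) until the
      // element count is small enough for a single MOVMSK.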
40644       while (NumElts > MaxElts) {
40645         SDValue Lo, Hi;
40646         std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
40647         Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
40648         NumElts /= 2;
40649       }
40650       EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
40651       Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
40652     }
40653     if (!Movmsk)
40654       return SDValue();
40655     Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
40656   } else {
40657     // FIXME: Better handling of k-registers or 512-bit vectors?
40658     unsigned MatchSizeInBits = Match.getValueSizeInBits();
40659     if (!(MatchSizeInBits == 128 ||
40660           (MatchSizeInBits == 256 && Subtarget.hasAVX())))
40661       return SDValue();
40662 
40663     // Make sure this isn't a vector of 1 element. The perf win from using
    // MOVMSK diminishes with fewer elements in the reduction, but it is
40665     // generally better to get the comparison over to the GPRs as soon as
40666     // possible to reduce the number of vector ops.
40667     if (Match.getValueType().getVectorNumElements() < 2)
40668       return SDValue();
40669 
40670     // Check that we are extracting a reduction of all sign bits.
40671     if (DAG.ComputeNumSignBits(Match) != BitWidth)
40672       return SDValue();
40673 
40674     if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
40675       SDValue Lo, Hi;
40676       std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
40677       Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
40678       MatchSizeInBits = Match.getValueSizeInBits();
40679     }
40680 
40681     // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
40682     MVT MaskSrcVT;
40683     if (64 == BitWidth || 32 == BitWidth)
40684       MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
40685                                    MatchSizeInBits / BitWidth);
40686     else
40687       MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
40688 
40689     SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
40690     Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
40691     NumElts = MaskSrcVT.getVectorNumElements();
40692   }
40693   assert((NumElts <= 32 || NumElts == 64) &&
40694          "Not expecting more than 64 elements");
40695 
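  // The MOVMSK result has one bit per vector element, so an i32 comparison
  // suffices unless we combined 64 elements.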
40696   MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
40697   if (BinOp == ISD::XOR) {
40698     // parity -> (PARITY(MOVMSK X))
40699     SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
40700     return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
40701   }
40702 
40703   SDValue CmpC;
40704   ISD::CondCode CondCode;
40705   if (BinOp == ISD::OR) {
40706     // any_of -> MOVMSK != 0
40707     CmpC = DAG.getConstant(0, DL, CmpVT);
40708     CondCode = ISD::CondCode::SETNE;
40709   } else {
40710     // all_of -> MOVMSK == ((1 << NumElts) - 1)
40711     CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
40712                            DL, CmpVT);
40713     CondCode = ISD::CondCode::SETEQ;
40714   }
40715 
40716   // The setcc produces an i8 of 0/1, so extend that to the result width and
40717   // negate to get the final 0/-1 mask value.
40718   EVT SetccVT =
40719       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
40720   SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
40721   SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
40722   SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
40723   return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
40724 }
40725 
40726 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
40727                                       const X86Subtarget &Subtarget) {
40728   // PSADBW is only supported on SSE2 and up.
40729   if (!Subtarget.hasSSE2())
40730     return SDValue();
40731 
40732   EVT ExtractVT = Extract->getValueType(0);
40733   // Verify the type we're extracting is either i32 or i64.
40734   // FIXME: Could support other types, but this is what we have coverage for.
40735   if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
40736     return SDValue();
40737 
40738   EVT VT = Extract->getOperand(0).getValueType();
40739   if (!isPowerOf2_32(VT.getVectorNumElements()))
40740     return SDValue();
40741 
40742   // Match shuffle + add pyramid.
40743   ISD::NodeType BinOp;
40744   SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
40745 
  // The operand is expected to be zero-extended from i8
  // (verified in detectZextAbsDiff).
  // In order to convert to i64 and above, an additional any/zero/sign extend
  // is expected.
  // The zero extend from 32 bits has no mathematical effect on the result.
  // The sign extend is also effectively a zero extend
  // (it extends the sign bit, which is zero).
  // So it is correct to skip the sign/zero/any extend instruction.
40754   if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
40755                Root.getOpcode() == ISD::ZERO_EXTEND ||
40756                Root.getOpcode() == ISD::ANY_EXTEND))
40757     Root = Root.getOperand(0);
40758 
  // If there was a match, we want Root to be an ABS node that is the root of
  // an abs-diff pattern.
40761   if (!Root || Root.getOpcode() != ISD::ABS)
40762     return SDValue();
40763 
  // Check whether we have an abs-diff pattern feeding into the ABS node.
40765   SDValue Zext0, Zext1;
40766   if (!detectZextAbsDiff(Root, Zext0, Zext1))
40767     return SDValue();
40768 
40769   // Create the SAD instruction.
40770   SDLoc DL(Extract);
40771   SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
40772 
40773   // If the original vector was wider than 8 elements, sum over the results
40774   // in the SAD vector.
40775   unsigned Stages = Log2_32(VT.getVectorNumElements());
40776   EVT SadVT = SAD.getValueType();
40777   if (Stages > 3) {
40778     unsigned SadElems = SadVT.getVectorNumElements();
40779 
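    // Each iteration adds the upper half of the remaining partial sums onto
    // the lower half, halving the number of live elements.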
    for (unsigned i = Stages - 3; i > 0; --i) {
40781       SmallVector<int, 16> Mask(SadElems, -1);
      for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
40783         Mask[j] = MaskEnd + j;
40784 
40785       SDValue Shuffle =
40786           DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
40787       SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
40788     }
40789   }
40790 
40791   unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
40792   // Return the lowest ExtractSizeInBits bits.
40793   EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
40794                                SadVT.getSizeInBits() / ExtractSizeInBits);
40795   SAD = DAG.getBitcast(ResVT, SAD);
40796   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
40797                      Extract->getOperand(1));
40798 }
40799 
40800 // Attempt to peek through a target shuffle and extract the scalar from the
40801 // source.
40802 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
40803                                          TargetLowering::DAGCombinerInfo &DCI,
40804                                          const X86Subtarget &Subtarget) {
40805   if (DCI.isBeforeLegalizeOps())
40806     return SDValue();
40807 
40808   SDLoc dl(N);
40809   SDValue Src = N->getOperand(0);
40810   SDValue Idx = N->getOperand(1);
40811 
40812   EVT VT = N->getValueType(0);
40813   EVT SrcVT = Src.getValueType();
40814   EVT SrcSVT = SrcVT.getVectorElementType();
40815   unsigned SrcEltBits = SrcSVT.getSizeInBits();
40816   unsigned NumSrcElts = SrcVT.getVectorNumElements();
40817 
40818   // Don't attempt this for boolean mask vectors or unknown extraction indices.
40819   if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
40820     return SDValue();
40821 
40822   const APInt &IdxC = N->getConstantOperandAPInt(1);
40823   if (IdxC.uge(NumSrcElts))
40824     return SDValue();
40825 
40826   SDValue SrcBC = peekThroughBitcasts(Src);
40827 
40828   // Handle extract(bitcast(broadcast(scalar_value))).
40829   if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
40830     SDValue SrcOp = SrcBC.getOperand(0);
40831     EVT SrcOpVT = SrcOp.getValueType();
40832     if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
40833         (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
40834       unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
40835       unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
40836       // TODO support non-zero offsets.
40837       if (Offset == 0) {
40838         SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
40839         SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
40840         return SrcOp;
40841       }
40842     }
40843   }
40844 
40845   // If we're extracting a single element from a broadcast load and there are
40846   // no other users, just create a single load.
40847   if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
40848     auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
40849     unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
40850     if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
40851         VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
40852       SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
40853                                  MemIntr->getBasePtr(),
40854                                  MemIntr->getPointerInfo(),
40855                                  MemIntr->getOriginalAlign(),
40856                                  MemIntr->getMemOperand()->getFlags());
40857       DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
40858       return Load;
40859     }
40860   }
40861 
40862   // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
40863   // TODO: Move to DAGCombine?
40864   if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
40865       SrcBC.getValueType().isInteger() &&
40866       (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
40867       SrcBC.getScalarValueSizeInBits() ==
40868           SrcBC.getOperand(0).getValueSizeInBits()) {
40869     unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
40870     if (IdxC.ult(Scale)) {
40871       unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
40872       SDValue Scl = SrcBC.getOperand(0);
40873       EVT SclVT = Scl.getValueType();
40874       if (Offset) {
40875         Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
40876                           DAG.getShiftAmountConstant(Offset, SclVT, dl));
40877       }
40878       Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
40879       Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
40880       return Scl;
40881     }
40882   }
40883 
40884   // Handle extract(truncate(x)) for 0'th index.
40885   // TODO: Treat this as a faux shuffle?
40886   // TODO: When can we use this for general indices?
40887   if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
40888       (SrcVT.getSizeInBits() % 128) == 0) {
40889     Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
40890     MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
40891     return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
40892                        Idx);
40893   }
40894 
40895   // We can only legally extract other elements from 128-bit vectors and in
40896   // certain circumstances, depending on SSE-level.
40897   // TODO: Investigate float/double extraction if it will be just stored.
40898   auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
40899                                                  unsigned Idx) {
40900     EVT VecSVT = VecVT.getScalarType();
40901     if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
40902         (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
40903          VecSVT == MVT::i64)) {
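      // Extract the 128-bit subvector containing the requested element and
      // adjust Idx to be relative to that lane.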
40904       unsigned EltSizeInBits = VecSVT.getSizeInBits();
40905       unsigned NumEltsPerLane = 128 / EltSizeInBits;
40906       unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
40907       unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
40908       VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
40909       Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
40910       Idx &= (NumEltsPerLane - 1);
40911     }
40912     if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
40913         ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
40914       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
40915                          DAG.getBitcast(VecVT, Vec),
40916                          DAG.getIntPtrConstant(Idx, dl));
40917     }
40918     if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
40919         (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
40920       unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
40921       return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
40922                          DAG.getTargetConstant(Idx, dl, MVT::i8));
40923     }
40924     return SDValue();
40925   };
40926 
40927   // Resolve the target shuffle inputs and mask.
40928   SmallVector<int, 16> Mask;
40929   SmallVector<SDValue, 2> Ops;
40930   if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
40931     return SDValue();
40932 
40933   // Shuffle inputs must be the same size as the result.
40934   if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
40935         return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
40936       }))
40937     return SDValue();
40938 
40939   // Attempt to narrow/widen the shuffle mask to the correct size.
40940   if (Mask.size() != NumSrcElts) {
40941     if ((NumSrcElts % Mask.size()) == 0) {
40942       SmallVector<int, 16> ScaledMask;
40943       int Scale = NumSrcElts / Mask.size();
40944       narrowShuffleMaskElts(Scale, Mask, ScaledMask);
40945       Mask = std::move(ScaledMask);
40946     } else if ((Mask.size() % NumSrcElts) == 0) {
40947       // Simplify Mask based on demanded element.
40948       int ExtractIdx = (int)IdxC.getZExtValue();
40949       int Scale = Mask.size() / NumSrcElts;
40950       int Lo = Scale * ExtractIdx;
40951       int Hi = Scale * (ExtractIdx + 1);
40952       for (int i = 0, e = (int)Mask.size(); i != e; ++i)
40953         if (i < Lo || Hi <= i)
40954           Mask[i] = SM_SentinelUndef;
40955 
40956       SmallVector<int, 16> WidenedMask;
40957       while (Mask.size() > NumSrcElts &&
40958              canWidenShuffleElements(Mask, WidenedMask))
40959         Mask = std::move(WidenedMask);
40960     }
40961   }
40962 
40963   // If narrowing/widening failed, see if we can extract+zero-extend.
40964   int ExtractIdx;
40965   EVT ExtractVT;
40966   if (Mask.size() == NumSrcElts) {
40967     ExtractIdx = Mask[IdxC.getZExtValue()];
40968     ExtractVT = SrcVT;
40969   } else {
40970     unsigned Scale = Mask.size() / NumSrcElts;
40971     if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
40972       return SDValue();
40973     unsigned ScaledIdx = Scale * IdxC.getZExtValue();
40974     if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
40975       return SDValue();
40976     ExtractIdx = Mask[ScaledIdx];
40977     EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
40978     ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
40979     assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
40980            "Failed to widen vector type");
40981   }
40982 
40983   // If the shuffle source element is undef/zero then we can just accept it.
40984   if (ExtractIdx == SM_SentinelUndef)
40985     return DAG.getUNDEF(VT);
40986 
40987   if (ExtractIdx == SM_SentinelZero)
40988     return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
40989                                 : DAG.getConstant(0, dl, VT);
40990 
40991   SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
40992   ExtractIdx = ExtractIdx % Mask.size();
40993   if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
40994     return DAG.getZExtOrTrunc(V, dl, VT);
40995 
40996   return SDValue();
40997 }
40998 
40999 /// Extracting a scalar FP value from vector element 0 is free, so extract each
41000 /// operand first, then perform the math as a scalar op.
41001 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
41002   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
41003   SDValue Vec = ExtElt->getOperand(0);
41004   SDValue Index = ExtElt->getOperand(1);
41005   EVT VT = ExtElt->getValueType(0);
41006   EVT VecVT = Vec.getValueType();
41007 
41008   // TODO: If this is a unary/expensive/expand op, allow extraction from a
41009   // non-zero element because the shuffle+scalar op will be cheaper?
41010   if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
41011     return SDValue();
41012 
41013   // Vector FP compares don't fit the pattern of FP math ops (propagate, not
41014   // extract, the condition code), so deal with those as a special-case.
41015   if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
41016     EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
41017     if (OpVT != MVT::f32 && OpVT != MVT::f64)
41018       return SDValue();
41019 
41020     // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
41021     SDLoc DL(ExtElt);
41022     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
41023                                Vec.getOperand(0), Index);
41024     SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
41025                                Vec.getOperand(1), Index);
41026     return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
41027   }
41028 
41029   if (VT != MVT::f32 && VT != MVT::f64)
41030     return SDValue();
41031 
41032   // Vector FP selects don't fit the pattern of FP math ops (because the
41033   // condition has a different type and we have to change the opcode), so deal
41034   // with those here.
41035   // FIXME: This is restricted to pre type legalization by ensuring the setcc
41036   // has i1 elements. If we loosen this we need to convert vector bool to a
41037   // scalar bool.
41038   if (Vec.getOpcode() == ISD::VSELECT &&
41039       Vec.getOperand(0).getOpcode() == ISD::SETCC &&
41040       Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
41041       Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
41042     // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
41043     SDLoc DL(ExtElt);
41044     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
41045                                Vec.getOperand(0).getValueType().getScalarType(),
41046                                Vec.getOperand(0), Index);
41047     SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
41048                                Vec.getOperand(1), Index);
41049     SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
41050                                Vec.getOperand(2), Index);
41051     return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
41052   }
41053 
41054   // TODO: This switch could include FNEG and the x86-specific FP logic ops
41055   // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
41056   // missed load folding and fma+fneg combining.
41057   switch (Vec.getOpcode()) {
41058   case ISD::FMA: // Begin 3 operands
41059   case ISD::FMAD:
41060   case ISD::FADD: // Begin 2 operands
41061   case ISD::FSUB:
41062   case ISD::FMUL:
41063   case ISD::FDIV:
41064   case ISD::FREM:
41065   case ISD::FCOPYSIGN:
41066   case ISD::FMINNUM:
41067   case ISD::FMAXNUM:
41068   case ISD::FMINNUM_IEEE:
41069   case ISD::FMAXNUM_IEEE:
41070   case ISD::FMAXIMUM:
41071   case ISD::FMINIMUM:
41072   case X86ISD::FMAX:
41073   case X86ISD::FMIN:
41074   case ISD::FABS: // Begin 1 operand
41075   case ISD::FSQRT:
41076   case ISD::FRINT:
41077   case ISD::FCEIL:
41078   case ISD::FTRUNC:
41079   case ISD::FNEARBYINT:
41080   case ISD::FROUND:
41081   case ISD::FFLOOR:
41082   case X86ISD::FRCP:
41083   case X86ISD::FRSQRT: {
41084     // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
41085     SDLoc DL(ExtElt);
41086     SmallVector<SDValue, 4> ExtOps;
41087     for (SDValue Op : Vec->ops())
41088       ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
41089     return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
41090   }
41091   default:
41092     return SDValue();
41093   }
41094   llvm_unreachable("All opcodes should return within switch");
41095 }
41096 
41097 /// Try to convert a vector reduction sequence composed of binops and shuffles
41098 /// into horizontal ops.
41099 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
41100                                      const X86Subtarget &Subtarget) {
41101   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
41102 
  // We need at least SSE2 to do anything here.
41104   if (!Subtarget.hasSSE2())
41105     return SDValue();
41106 
41107   ISD::NodeType Opc;
41108   SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
41109                                         {ISD::ADD, ISD::MUL, ISD::FADD}, true);
41110   if (!Rdx)
41111     return SDValue();
41112 
41113   SDValue Index = ExtElt->getOperand(1);
41114   assert(isNullConstant(Index) &&
41115          "Reduction doesn't end in an extract from index 0");
41116 
41117   EVT VT = ExtElt->getValueType(0);
41118   EVT VecVT = Rdx.getValueType();
41119   if (VecVT.getScalarType() != VT)
41120     return SDValue();
41121 
41122   SDLoc DL(ExtElt);
41123 
41124   // vXi8 mul reduction - promote to vXi16 mul reduction.
41125   if (Opc == ISD::MUL) {
41126     unsigned NumElts = VecVT.getVectorNumElements();
41127     if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
41128       return SDValue();
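    // Widen the i8 elements to i16 by interleaving with undef bytes; only the
    // low byte of each i16 product feeds the extracted i8 result, so the
    // undef high bytes never affect it.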
41129     if (VecVT.getSizeInBits() >= 128) {
41130       EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
41131       SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
41132       SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
41133       Lo = DAG.getBitcast(WideVT, Lo);
41134       Hi = DAG.getBitcast(WideVT, Hi);
41135       Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
41136       while (Rdx.getValueSizeInBits() > 128) {
41137         std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
41138         Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
41139       }
41140     } else {
41141       if (VecVT == MVT::v4i8)
41142         Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
41143                           DAG.getUNDEF(MVT::v4i8));
41144       Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
41145                         DAG.getUNDEF(MVT::v8i8));
41146       Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
41147       Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
41148     }
41149     if (NumElts >= 8)
41150       Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41151                         DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41152                                              {4, 5, 6, 7, -1, -1, -1, -1}));
41153     Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41154                       DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41155                                            {2, 3, -1, -1, -1, -1, -1, -1}));
41156     Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41157                       DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41158                                            {1, -1, -1, -1, -1, -1, -1, -1}));
41159     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41160     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41161   }
41162 
41163   // vXi8 add reduction - sub 128-bit vector.
41164   if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
41165     if (VecVT == MVT::v4i8) {
41166       // Pad with zero.
41167       if (Subtarget.hasSSE41()) {
41168         Rdx = DAG.getBitcast(MVT::i32, Rdx);
41169         Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
41170                           DAG.getConstant(0, DL, MVT::v4i32), Rdx,
41171                           DAG.getIntPtrConstant(0, DL));
41172         Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41173       } else {
41174         Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
41175                           DAG.getConstant(0, DL, VecVT));
41176       }
41177     }
41178     if (Rdx.getValueType() == MVT::v8i8) {
41179       // Pad with undef.
41180       Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
41181                         DAG.getUNDEF(MVT::v8i8));
41182     }
41183     Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
41184                       DAG.getConstant(0, DL, MVT::v16i8));
41185     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41186     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41187   }
41188 
41189   // Must be a >=128-bit vector with pow2 elements.
41190   if ((VecVT.getSizeInBits() % 128) != 0 ||
41191       !isPowerOf2_32(VecVT.getVectorNumElements()))
41192     return SDValue();
41193 
41194   // vXi8 add reduction - sum lo/hi halves then use PSADBW.
41195   if (VT == MVT::i8) {
41196     while (Rdx.getValueSizeInBits() > 128) {
41197       SDValue Lo, Hi;
41198       std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
41199       VecVT = Lo.getValueType();
41200       Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
41201     }
41202     assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
41203 
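    // Any wrap-around in the v16i8 adds is fine as only the low 8 bits of the
    // final sum are extracted.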
41204     SDValue Hi = DAG.getVectorShuffle(
41205         MVT::v16i8, DL, Rdx, Rdx,
41206         {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
41207     Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
41208     Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
41209                       getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
41210     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41211     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41212   }
41213 
  // Only use (F)HADD opcodes if they aren't microcoded or when we are
  // minimizing codesize.
41215   if (!shouldUseHorizontalOp(true, DAG, Subtarget))
41216     return SDValue();
41217 
41218   unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
41219 
41220   // 256-bit horizontal instructions operate on 128-bit chunks rather than
41221   // across the whole vector, so we need an extract + hop preliminary stage.
41222   // This is the only step where the operands of the hop are not the same value.
41223   // TODO: We could extend this to handle 512-bit or even longer vectors.
41224   if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
41225       ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
41226     unsigned NumElts = VecVT.getVectorNumElements();
41227     SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
41228     SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
41229     Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
41230     VecVT = Rdx.getValueType();
41231   }
41232   if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
41233       !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
41234     return SDValue();
41235 
41236   // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
41237   unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
41238   for (unsigned i = 0; i != ReductionSteps; ++i)
41239     Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
41240 
41241   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41242 }
41243 
41244 /// Detect vector gather/scatter index generation and convert it from being a
41245 /// bunch of shuffles and extracts into a somewhat faster sequence.
41246 /// For i686, the best sequence is apparently storing the value and loading
41247 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
41248 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
41249                                        TargetLowering::DAGCombinerInfo &DCI,
41250                                        const X86Subtarget &Subtarget) {
41251   if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
41252     return NewOp;
41253 
41254   SDValue InputVector = N->getOperand(0);
41255   SDValue EltIdx = N->getOperand(1);
41256   auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
41257 
41258   EVT SrcVT = InputVector.getValueType();
41259   EVT VT = N->getValueType(0);
41260   SDLoc dl(InputVector);
41261   bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
41262   unsigned NumSrcElts = SrcVT.getVectorNumElements();
41263 
41264   if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
41265     return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
41266 
41267   // Integer Constant Folding.
41268   if (CIdx && VT.isInteger()) {
41269     APInt UndefVecElts;
41270     SmallVector<APInt, 16> EltBits;
41271     unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
41272     if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
41273                                       EltBits, true, false)) {
41274       uint64_t Idx = CIdx->getZExtValue();
41275       if (UndefVecElts[Idx])
41276         return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
41277       return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
41278                              dl, VT);
41279     }
41280   }
41281 
41282   if (IsPextr) {
41283     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41284     if (TLI.SimplifyDemandedBits(
41285             SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
41286       return SDValue(N, 0);
41287 
41288     // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
41289     if ((InputVector.getOpcode() == X86ISD::PINSRB ||
41290          InputVector.getOpcode() == X86ISD::PINSRW) &&
41291         InputVector.getOperand(2) == EltIdx) {
41292       assert(SrcVT == InputVector.getOperand(0).getValueType() &&
41293              "Vector type mismatch");
41294       SDValue Scl = InputVector.getOperand(1);
41295       Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
41296       return DAG.getZExtOrTrunc(Scl, dl, VT);
41297     }
41298 
41299     // TODO - Remove this once we can handle the implicit zero-extension of
41300     // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
41301     // combineBasicSADPattern.
41302     return SDValue();
41303   }
41304 
41305   // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
41306   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
41307       VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
41308     SDValue MMXSrc = InputVector.getOperand(0);
41309 
41310     // The bitcast source is a direct mmx result.
41311     if (MMXSrc.getValueType() == MVT::x86mmx)
41312       return DAG.getBitcast(VT, InputVector);
41313   }
41314 
41315   // Detect mmx to i32 conversion through a v2i32 elt extract.
41316   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
41317       VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
41318     SDValue MMXSrc = InputVector.getOperand(0);
41319 
41320     // The bitcast source is a direct mmx result.
41321     if (MMXSrc.getValueType() == MVT::x86mmx)
41322       return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
41323   }
41324 
41325   // Check whether this extract is the root of a sum of absolute differences
41326   // pattern. This has to be done here because we really want it to happen
  // pre-legalization.
41328   if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
41329     return SAD;
41330 
41331   // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
41332   if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
41333     return Cmp;
41334 
41335   // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
41336   if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
41337     return MinMax;
41338 
41339   // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
41340   if (SDValue V = combineArithReduction(N, DAG, Subtarget))
41341     return V;
41342 
41343   if (SDValue V = scalarizeExtEltFP(N, DAG))
41344     return V;
41345 
  // Attempt to extract an i1 element by using MOVMSK to extract the signbits
41347   // and then testing the relevant element.
41348   //
41349   // Note that we only combine extracts on the *same* result number, i.e.
41350   //   t0 = merge_values a0, a1, a2, a3
41351   //   i1 = extract_vector_elt t0, Constant:i64<2>
41352   //   i1 = extract_vector_elt t0, Constant:i64<3>
41353   // but not
41354   //   i1 = extract_vector_elt t0:1, Constant:i64<2>
41355   // since the latter would need its own MOVMSK.
41356   if (CIdx && SrcVT.getScalarType() == MVT::i1) {
41357     SmallVector<SDNode *, 16> BoolExtracts;
41358     unsigned ResNo = InputVector.getResNo();
41359     auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
41360       if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41361           isa<ConstantSDNode>(Use->getOperand(1)) &&
41362           Use->getOperand(0).getResNo() == ResNo &&
41363           Use->getValueType(0) == MVT::i1) {
41364         BoolExtracts.push_back(Use);
41365         return true;
41366       }
41367       return false;
41368     };
41369     if (all_of(InputVector->uses(), IsBoolExtract) &&
41370         BoolExtracts.size() > 1) {
41371       EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
41372       if (SDValue BC =
41373               combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
41374         for (SDNode *Use : BoolExtracts) {
41375           // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
41376           unsigned MaskIdx = Use->getConstantOperandVal(1);
41377           APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
41378           SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
41379           SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
41380           Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
41381           DCI.CombineTo(Use, Res);
41382         }
41383         return SDValue(N, 0);
41384       }
41385     }
41386   }
41387 
41388   return SDValue();
41389 }
41390 
41391 /// If a vector select has an operand that is -1 or 0, try to simplify the
41392 /// select to a bitwise logic operation.
41393 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
41394 static SDValue
41395 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
41396                                  TargetLowering::DAGCombinerInfo &DCI,
41397                                  const X86Subtarget &Subtarget) {
41398   SDValue Cond = N->getOperand(0);
41399   SDValue LHS = N->getOperand(1);
41400   SDValue RHS = N->getOperand(2);
41401   EVT VT = LHS.getValueType();
41402   EVT CondVT = Cond.getValueType();
41403   SDLoc DL(N);
41404   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41405 
41406   if (N->getOpcode() != ISD::VSELECT)
41407     return SDValue();
41408 
41409   assert(CondVT.isVector() && "Vector select expects a vector selector!");
41410 
41411   // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
41412   // TODO: Can we assert that both operands are not zeros (because that should
41413   //       get simplified at node creation time)?
41414   bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
41415   bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
41416 
41417   // If both inputs are 0/undef, create a complete zero vector.
41418   // FIXME: As noted above this should be handled by DAGCombiner/getNode.
41419   if (TValIsAllZeros && FValIsAllZeros) {
41420     if (VT.isFloatingPoint())
41421       return DAG.getConstantFP(0.0, DL, VT);
41422     return DAG.getConstant(0, DL, VT);
41423   }
41424 
41425   // To use the condition operand as a bitwise mask, it must have elements that
  // are the same size as the select elements. I.e., the condition operand must
41427   // have already been promoted from the IR select condition type <N x i1>.
41428   // Don't check if the types themselves are equal because that excludes
41429   // vector floating-point selects.
41430   if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
41431     return SDValue();
41432 
41433   // Try to invert the condition if true value is not all 1s and false value is
41434   // not all 0s. Only do this if the condition has one use.
41435   bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
41436   if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
41437       // Check if the selector will be produced by CMPP*/PCMP*.
41438       Cond.getOpcode() == ISD::SETCC &&
41439       // Check if SETCC has already been promoted.
41440       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
41441           CondVT) {
41442     bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
41443 
41444     if (TValIsAllZeros || FValIsAllOnes) {
41445       SDValue CC = Cond.getOperand(2);
41446       ISD::CondCode NewCC = ISD::getSetCCInverse(
41447           cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
41448       Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
41449                           NewCC);
41450       std::swap(LHS, RHS);
41451       TValIsAllOnes = FValIsAllOnes;
41452       FValIsAllZeros = TValIsAllZeros;
41453     }
41454   }
41455 
41456   // Cond value must be 'sign splat' to be converted to a logical op.
41457   if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
41458     return SDValue();
41459 
41460   // vselect Cond, 111..., 000... -> Cond
41461   if (TValIsAllOnes && FValIsAllZeros)
41462     return DAG.getBitcast(VT, Cond);
41463 
41464   if (!TLI.isTypeLegal(CondVT))
41465     return SDValue();
41466 
41467   // vselect Cond, 111..., X -> or Cond, X
41468   if (TValIsAllOnes) {
41469     SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
41470     SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
41471     return DAG.getBitcast(VT, Or);
41472   }
41473 
41474   // vselect Cond, X, 000... -> and Cond, X
41475   if (FValIsAllZeros) {
41476     SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
41477     SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
41478     return DAG.getBitcast(VT, And);
41479   }
41480 
41481   // vselect Cond, 000..., X -> andn Cond, X
41482   if (TValIsAllZeros) {
41483     SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
41484     SDValue AndN;
    // The canonical form differs for i1 vectors: X86ISD::ANDNP is not used.
41486     if (CondVT.getScalarType() == MVT::i1)
41487       AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
41488                          CastRHS);
41489     else
41490       AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
41491     return DAG.getBitcast(VT, AndN);
41492   }
41493 
41494   return SDValue();
41495 }
41496 
41497 /// If both arms of a vector select are concatenated vectors, split the select,
41498 /// and concatenate the result to eliminate a wide (256-bit) vector instruction:
41499 ///   vselect Cond, (concat T0, T1), (concat F0, F1) -->
41500 ///   concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
41501 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
41502                                   const X86Subtarget &Subtarget) {
41503   unsigned Opcode = N->getOpcode();
41504   if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
41505     return SDValue();
41506 
41507   // TODO: Split 512-bit vectors too?
41508   EVT VT = N->getValueType(0);
41509   if (!VT.is256BitVector())
41510     return SDValue();
41511 
41512   // TODO: Split as long as any 2 of the 3 operands are concatenated?
41513   SDValue Cond = N->getOperand(0);
41514   SDValue TVal = N->getOperand(1);
41515   SDValue FVal = N->getOperand(2);
41516   SmallVector<SDValue, 4> CatOpsT, CatOpsF;
41517   if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
41518       !collectConcatOps(TVal.getNode(), CatOpsT) ||
41519       !collectConcatOps(FVal.getNode(), CatOpsF))
41520     return SDValue();
41521 
41522   auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
41523                             ArrayRef<SDValue> Ops) {
41524     return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
41525   };
41526   return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
41527                           makeBlend, /*CheckBWI*/ false);
41528 }
41529 
41530 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
41531   SDValue Cond = N->getOperand(0);
41532   SDValue LHS = N->getOperand(1);
41533   SDValue RHS = N->getOperand(2);
41534   SDLoc DL(N);
41535 
41536   auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
41537   auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
41538   if (!TrueC || !FalseC)
41539     return SDValue();
41540 
41541   // Don't do this for crazy integer types.
41542   EVT VT = N->getValueType(0);
41543   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
41544     return SDValue();
41545 
41546   // We're going to use the condition bit in math or logic ops. We could allow
41547   // this with a wider condition value (post-legalization it becomes an i8),
41548   // but if nothing is creating selects that late, it doesn't matter.
41549   if (Cond.getValueType() != MVT::i1)
41550     return SDValue();
41551 
41552   // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
41553   // 3, 5, or 9 with i32/i64, so those get transformed too.
41554   // TODO: For constants that overflow or do not differ by power-of-2 or small
41555   // multiplier, convert to 'and' + 'add'.
41556   const APInt &TrueVal = TrueC->getAPIntValue();
41557   const APInt &FalseVal = FalseC->getAPIntValue();
41558   bool OV;
41559   APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
41560   if (OV)
41561     return SDValue();
41562 
41563   APInt AbsDiff = Diff.abs();
41564   if (AbsDiff.isPowerOf2() ||
41565       ((VT == MVT::i32 || VT == MVT::i64) &&
41566        (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
41567 
41568     // We need a positive multiplier constant for shift/LEA codegen. The 'not'
41569     // of the condition can usually be folded into a compare predicate, but even
41570     // without that, the sequence should be cheaper than a CMOV alternative.
41571     if (TrueVal.slt(FalseVal)) {
41572       Cond = DAG.getNOT(DL, Cond, MVT::i1);
41573       std::swap(TrueC, FalseC);
41574     }
41575 
41576     // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
41577     SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
41578 
41579     // Multiply condition by the difference if non-one.
41580     if (!AbsDiff.isOneValue())
41581       R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
41582 
41583     // Add the base if non-zero.
41584     if (!FalseC->isNullValue())
41585       R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
41586 
41587     return R;
41588   }
41589 
41590   return SDValue();
41591 }
41592 
41593 /// If this is a *dynamic* select (non-constant condition) and we can match
41594 /// this node with one of the variable blend instructions, restructure the
41595 /// condition so that blends can use the high (sign) bit of each element.
41596 /// This function will also call SimplifyDemandedBits on already created
41597 /// BLENDV to perform additional simplifications.
41598 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
41599                                            TargetLowering::DAGCombinerInfo &DCI,
41600                                            const X86Subtarget &Subtarget) {
41601   SDValue Cond = N->getOperand(0);
41602   if ((N->getOpcode() != ISD::VSELECT &&
41603        N->getOpcode() != X86ISD::BLENDV) ||
41604       ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
41605     return SDValue();
41606 
41607   // Don't optimize before the condition has been transformed to a legal type
41608   // and don't ever optimize vector selects that map to AVX512 mask-registers.
41609   unsigned BitWidth = Cond.getScalarValueSizeInBits();
41610   if (BitWidth < 8 || BitWidth > 64)
41611     return SDValue();
41612 
41613   // We can only handle the cases where VSELECT is directly legal on the
41614   // subtarget. We custom lower VSELECT nodes with constant conditions and
41615   // this makes it hard to see whether a dynamic VSELECT will correctly
41616   // lower, so we both check the operation's status and explicitly handle the
41617   // cases where a *dynamic* blend will fail even though a constant-condition
41618   // blend could be custom lowered.
41619   // FIXME: We should find a better way to handle this class of problems.
41620   // Potentially, we should combine constant-condition vselect nodes
41621   // pre-legalization into shuffles and not mark as many types as custom
41622   // lowered.
41623   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41624   EVT VT = N->getValueType(0);
41625   if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
41626     return SDValue();
41627   // FIXME: We don't support i16-element blends currently. We could and
41628   // should support them by making *all* the bits in the condition be set
41629   // rather than just the high bit and using an i8-element blend.
41630   if (VT.getVectorElementType() == MVT::i16)
41631     return SDValue();
41632   // Dynamic blending was only available from SSE4.1 onward.
41633   if (VT.is128BitVector() && !Subtarget.hasSSE41())
41634     return SDValue();
  // Byte blends are only available in AVX2.
41636   if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
41637     return SDValue();
41638   // There are no 512-bit blend instructions that use sign bits.
41639   if (VT.is512BitVector())
41640     return SDValue();
41641 
41642   auto OnlyUsedAsSelectCond = [](SDValue Cond) {
41643     for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
41644          UI != UE; ++UI)
41645       if ((UI->getOpcode() != ISD::VSELECT &&
41646            UI->getOpcode() != X86ISD::BLENDV) ||
41647           UI.getOperandNo() != 0)
41648         return false;
41649 
41650     return true;
41651   };
41652 
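  // The blend instructions only use the sign bit of each condition element,
  // so that is the only bit we need to demand from Cond.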
41653   APInt DemandedBits(APInt::getSignMask(BitWidth));
41654 
41655   if (OnlyUsedAsSelectCond(Cond)) {
41656     KnownBits Known;
41657     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
41658                                           !DCI.isBeforeLegalizeOps());
41659     if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
41660       return SDValue();
41661 
41662     // If we changed the computation somewhere in the DAG, this change will
41663     // affect all users of Cond. Update all the nodes so that we do not use
41664     // the generic VSELECT anymore. Otherwise, we may perform wrong
41665     // optimizations as we messed with the actual expectation for the vector
41666     // boolean values.
41667     for (SDNode *U : Cond->uses()) {
41668       if (U->getOpcode() == X86ISD::BLENDV)
41669         continue;
41670 
41671       SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
41672                                Cond, U->getOperand(1), U->getOperand(2));
41673       DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
41674       DCI.AddToWorklist(U);
41675     }
41676     DCI.CommitTargetLoweringOpt(TLO);
41677     return SDValue(N, 0);
41678   }
41679 
41680   // Otherwise we can still at least try to simplify multiple use bits.
41681   if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
    return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
                       N->getOperand(1), N->getOperand(2));
41684 
41685   return SDValue();
41686 }
41687 
41688 // Try to match:
//   (or (and M, (sub 0, X)), (pandn M, X))
41690 // which is a special case of:
41691 //   (select M, (sub 0, X), X)
41692 // Per:
41693 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
41694 // We know that, if fNegate is 0 or 1:
41695 //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
41696 //
41697 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
41698 //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
41699 //   ( M      ? -X : X) == ((X ^   M     ) + (M & 1))
41700 // This lets us transform our vselect to:
41701 //   (add (xor X, M), (and M, 1))
41702 // And further to:
41703 //   (sub (xor X, M), M)
41704 static SDValue combineLogicBlendIntoConditionalNegate(
41705     EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
41706     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
41707   EVT MaskVT = Mask.getValueType();
41708   assert(MaskVT.isInteger() &&
41709          DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
41710          "Mask must be zero/all-bits");
41711 
41712   if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
41713     return SDValue();
41714   if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
41715     return SDValue();
41716 
41717   auto IsNegV = [](SDNode *N, SDValue V) {
41718     return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
41719            ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
41720   };
41721 
41722   SDValue V;
41723   if (IsNegV(Y.getNode(), X))
41724     V = X;
41725   else if (IsNegV(X.getNode(), Y))
41726     V = Y;
41727   else
41728     return SDValue();
41729 
41730   SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
41731   SDValue SubOp2 = Mask;
41732 
41733   // If the negate was on the false side of the select, then
41734   // the operands of the SUB need to be swapped. PR 27251.
  // This is because the pattern being matched above is
  // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
  // but if the pattern matched was
  // (vselect M, X, (sub 0, X)), that is really a negation of the pattern
  // above, -(vselect M, (sub 0, X), X), and therefore the replacement
  // pattern also needs to be a negation of the replacement pattern above.
41741   // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
41742   // sub accomplishes the negation of the replacement pattern.
41743   if (V == Y)
41744     std::swap(SubOp1, SubOp2);
41745 
41746   SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
41747   return DAG.getBitcast(VT, Res);
41748 }
41749 
41750 /// Do target-specific dag combines on SELECT and VSELECT nodes.
41751 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
41752                              TargetLowering::DAGCombinerInfo &DCI,
41753                              const X86Subtarget &Subtarget) {
41754   SDLoc DL(N);
41755   SDValue Cond = N->getOperand(0);
41756   SDValue LHS = N->getOperand(1);
41757   SDValue RHS = N->getOperand(2);
41758 
41759   // Try simplification again because we use this function to optimize
41760   // BLENDV nodes that are not handled by the generic combiner.
41761   if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
41762     return V;
41763 
41764   EVT VT = LHS.getValueType();
41765   EVT CondVT = Cond.getValueType();
41766   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41767   bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
41768 
41769   // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
41770   // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
41771   // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
41772   if (CondVT.isVector() && CondVT.isInteger() &&
41773       CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
41774       (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
41775       DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
41776     if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
41777                                                            DL, DAG, Subtarget))
41778       return V;
41779 
41780   // Convert vselects with constant condition into shuffles.
41781   if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
41782     SmallVector<int, 64> Mask;
41783     if (createShuffleMaskFromVSELECT(Mask, Cond))
41784       return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
41785   }
41786 
41787   // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
41788   // by forcing the unselected elements to zero.
41789   // TODO: Can we handle more shuffles with this?
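  // Illustrative sketch: for each lane that the condition assigns to the LHS,
  // the RHS pshufb mask below gets 0x80 (zeroing that lane), and vice versa,
  // so the final OR merges the two half-populated results.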
41790   if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
41791       LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
41792       LHS.hasOneUse() && RHS.hasOneUse()) {
41793     MVT SimpleVT = VT.getSimpleVT();
41794     SmallVector<SDValue, 1> LHSOps, RHSOps;
41795     SmallVector<int, 64> LHSMask, RHSMask, CondMask;
41796     if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
41797         getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
41798         getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
41799       int NumElts = VT.getVectorNumElements();
41800       for (int i = 0; i != NumElts; ++i) {
41801         if (CondMask[i] < NumElts)
41802           RHSMask[i] = 0x80;
41803         else
41804           LHSMask[i] = 0x80;
41805       }
41806       LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
41807                         getConstVector(LHSMask, SimpleVT, DAG, DL, true));
41808       RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
41809                         getConstVector(RHSMask, SimpleVT, DAG, DL, true));
41810       return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
41811     }
41812   }
41813 
41814   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
41815   // instructions match the semantics of the common C idiom x<y?x:y but not
41816   // x<=y?x:y, because of how they handle negative zero (which can be
41817   // ignored in unsafe-math mode).
41818   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
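  // Illustrative example: with x = -0.0 and y = +0.0, "x < y ? x : y" yields
  // +0.0 (the compare is false since -0.0 == +0.0), which matches MINSS(x, y),
  // whereas "x <= y ? x : y" would have to yield -0.0, which the instruction
  // does not guarantee.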
41819   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
41820       VT != MVT::f80 && VT != MVT::f128 &&
41821       (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
41822       (Subtarget.hasSSE2() ||
41823        (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
41824     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
41825 
41826     unsigned Opcode = 0;
41827     // Check for x CC y ? x : y.
41828     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
41829         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
41830       switch (CC) {
41831       default: break;
41832       case ISD::SETULT:
41833         // Converting this to a min would handle NaNs incorrectly, and swapping
41834         // the operands would cause it to handle comparisons between positive
41835         // and negative zero incorrectly.
41836         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
41837           if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41838               !(DAG.isKnownNeverZeroFloat(LHS) ||
41839                 DAG.isKnownNeverZeroFloat(RHS)))
41840             break;
41841           std::swap(LHS, RHS);
41842         }
41843         Opcode = X86ISD::FMIN;
41844         break;
41845       case ISD::SETOLE:
41846         // Converting this to a min would handle comparisons between positive
41847         // and negative zero incorrectly.
41848         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41849             !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
41850           break;
41851         Opcode = X86ISD::FMIN;
41852         break;
41853       case ISD::SETULE:
41854         // Converting this to a min would handle both negative zeros and NaNs
41855         // incorrectly, but we can swap the operands to fix both.
41856         std::swap(LHS, RHS);
41857         LLVM_FALLTHROUGH;
41858       case ISD::SETOLT:
41859       case ISD::SETLT:
41860       case ISD::SETLE:
41861         Opcode = X86ISD::FMIN;
41862         break;
41863 
41864       case ISD::SETOGE:
41865         // Converting this to a max would handle comparisons between positive
41866         // and negative zero incorrectly.
41867         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41868             !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
41869           break;
41870         Opcode = X86ISD::FMAX;
41871         break;
41872       case ISD::SETUGT:
41873         // Converting this to a max would handle NaNs incorrectly, and swapping
41874         // the operands would cause it to handle comparisons between positive
41875         // and negative zero incorrectly.
41876         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
41877           if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41878               !(DAG.isKnownNeverZeroFloat(LHS) ||
41879                 DAG.isKnownNeverZeroFloat(RHS)))
41880             break;
41881           std::swap(LHS, RHS);
41882         }
41883         Opcode = X86ISD::FMAX;
41884         break;
41885       case ISD::SETUGE:
41886         // Converting this to a max would handle both negative zeros and NaNs
41887         // incorrectly, but we can swap the operands to fix both.
41888         std::swap(LHS, RHS);
41889         LLVM_FALLTHROUGH;
41890       case ISD::SETOGT:
41891       case ISD::SETGT:
41892       case ISD::SETGE:
41893         Opcode = X86ISD::FMAX;
41894         break;
41895       }
41896     // Check for x CC y ? y : x -- a min/max with reversed arms.
41897     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
41898                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
41899       switch (CC) {
41900       default: break;
41901       case ISD::SETOGE:
41902         // Converting this to a min would handle comparisons between positive
41903         // and negative zero incorrectly, and swapping the operands would
41904         // cause it to handle NaNs incorrectly.
41905         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41906             !(DAG.isKnownNeverZeroFloat(LHS) ||
41907               DAG.isKnownNeverZeroFloat(RHS))) {
41908           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41909             break;
41910           std::swap(LHS, RHS);
41911         }
41912         Opcode = X86ISD::FMIN;
41913         break;
41914       case ISD::SETUGT:
41915         // Converting this to a min would handle NaNs incorrectly.
41916         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41917           break;
41918         Opcode = X86ISD::FMIN;
41919         break;
41920       case ISD::SETUGE:
41921         // Converting this to a min would handle both negative zeros and NaNs
41922         // incorrectly, but we can swap the operands to fix both.
41923         std::swap(LHS, RHS);
41924         LLVM_FALLTHROUGH;
41925       case ISD::SETOGT:
41926       case ISD::SETGT:
41927       case ISD::SETGE:
41928         Opcode = X86ISD::FMIN;
41929         break;
41930 
41931       case ISD::SETULT:
41932         // Converting this to a max would handle NaNs incorrectly.
41933         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41934           break;
41935         Opcode = X86ISD::FMAX;
41936         break;
41937       case ISD::SETOLE:
41938         // Converting this to a max would handle comparisons between positive
41939         // and negative zero incorrectly, and swapping the operands would
41940         // cause it to handle NaNs incorrectly.
41941         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41942             !DAG.isKnownNeverZeroFloat(LHS) &&
41943             !DAG.isKnownNeverZeroFloat(RHS)) {
41944           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41945             break;
41946           std::swap(LHS, RHS);
41947         }
41948         Opcode = X86ISD::FMAX;
41949         break;
41950       case ISD::SETULE:
41951         // Converting this to a max would handle both negative zeros and NaNs
41952         // incorrectly, but we can swap the operands to fix both.
41953         std::swap(LHS, RHS);
41954         LLVM_FALLTHROUGH;
41955       case ISD::SETOLT:
41956       case ISD::SETLT:
41957       case ISD::SETLE:
41958         Opcode = X86ISD::FMAX;
41959         break;
41960       }
41961     }
41962 
41963     if (Opcode)
41964       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
41965   }
41966 
41967   // Some mask scalar intrinsics rely on checking if only one bit is set
41968   // and implement it in C code like this:
41969   // A[0] = (U & 1) ? A[0] : W[0];
41970   // This creates some redundant instructions that break pattern matching.
  // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
41972   if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
41973       Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
41974     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
41975     SDValue AndNode = Cond.getOperand(0);
41976     if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
41977         isNullConstant(Cond.getOperand(1)) &&
41978         isOneConstant(AndNode.getOperand(1))) {
      // LHS and RHS are swapped because the setcc outputs 1 when the AND
      // result is 0 and vice versa.
41981       AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
41982       return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
41983     }
41984   }
41985 
  // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
  // lowering on KNL. In this case we convert it to
  // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
  // The same situation applies to all vectors of i8 and i16 without BWI.
  // Make sure we extend these even before type legalization gets a chance to
  // split wide vectors.
  // Since SKX, these selects have a proper lowering.
41993   if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
41994       CondVT.getVectorElementType() == MVT::i1 &&
41995       (VT.getVectorElementType() == MVT::i8 ||
41996        VT.getVectorElementType() == MVT::i16)) {
41997     Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
41998     return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
41999   }
42000 
42001   // AVX512 - Extend select with zero to merge with target shuffle.
42002   // select(mask, extract_subvector(shuffle(x)), zero) -->
42003   // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
42004   // TODO - support non target shuffles as well.
42005   if (Subtarget.hasAVX512() && CondVT.isVector() &&
42006       CondVT.getVectorElementType() == MVT::i1) {
42007     auto SelectableOp = [&TLI](SDValue Op) {
42008       return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42009              isTargetShuffle(Op.getOperand(0).getOpcode()) &&
42010              isNullConstant(Op.getOperand(1)) &&
42011              TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
42012              Op.hasOneUse() && Op.getOperand(0).hasOneUse();
42013     };
42014 
42015     bool SelectableLHS = SelectableOp(LHS);
42016     bool SelectableRHS = SelectableOp(RHS);
42017     bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
42018     bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
42019 
42020     if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
42021       EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
42022                                 : RHS.getOperand(0).getValueType();
42023       EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
42024       LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
42025                             VT.getSizeInBits());
42026       RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
42027                             VT.getSizeInBits());
42028       Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
42029                          DAG.getUNDEF(SrcCondVT), Cond,
42030                          DAG.getIntPtrConstant(0, DL));
42031       SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
42032       return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
42033     }
42034   }
42035 
42036   if (SDValue V = combineSelectOfTwoConstants(N, DAG))
42037     return V;
42038 
42039   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
42040       Cond.hasOneUse()) {
42041     EVT CondVT = Cond.getValueType();
42042     SDValue Cond0 = Cond.getOperand(0);
42043     SDValue Cond1 = Cond.getOperand(1);
42044     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
42045 
42046     // Canonicalize min/max:
42047     // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
42048     // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
42049     // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
42050     // the need for an extra compare against zero. e.g.
    // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
    // subl   %esi, %edi
    // testl  %edi, %edi
    // movl   $0, %eax
    // cmovgl %edi, %eax
    // =>
    // xorl   %eax, %eax
    // subl   %esi, %edi
42059     // cmovsl %eax, %edi
42060     //
42061     // We can also canonicalize
42062     //  (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
42063     //  (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
42064     // This allows the use of a test instruction for the compare.
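    // For example (illustrative), "(x u> 1) ? x : 1" can then be selected
    // roughly as
    //   testl   %edi, %edi
    //   movl    $1, %eax
    //   cmovnel %edi, %eax
    // rather than needing a "cmpl $1, %edi" against an immediate.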
42065     if (LHS == Cond0 && RHS == Cond1) {
42066       if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
42067           (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
42068         ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
42069         Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
42070         return DAG.getSelect(DL, VT, Cond, LHS, RHS);
42071       }
42072       if (CC == ISD::SETUGT && isOneConstant(RHS)) {
42073         ISD::CondCode NewCC = ISD::SETUGE;
42074         Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
42075         return DAG.getSelect(DL, VT, Cond, LHS, RHS);
42076       }
42077     }
42078 
42079     // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
42080     // fold eq + gt/lt nested selects into ge/le selects
42081     // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
42082     // --> (select (cmpuge Cond0, Cond1), LHS, Y)
42083     // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
42084     // --> (select (cmpsle Cond0, Cond1), LHS, Y)
42085     // .. etc ..
42086     if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
42087         RHS.getOperand(0).getOpcode() == ISD::SETCC) {
42088       SDValue InnerSetCC = RHS.getOperand(0);
42089       ISD::CondCode InnerCC =
42090           cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
42091       if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
42092           Cond0 == InnerSetCC.getOperand(0) &&
42093           Cond1 == InnerSetCC.getOperand(1)) {
42094         ISD::CondCode NewCC;
42095         switch (CC == ISD::SETEQ ? InnerCC : CC) {
42096         case ISD::SETGT:  NewCC = ISD::SETGE; break;
42097         case ISD::SETLT:  NewCC = ISD::SETLE; break;
42098         case ISD::SETUGT: NewCC = ISD::SETUGE; break;
42099         case ISD::SETULT: NewCC = ISD::SETULE; break;
42100         default: NewCC = ISD::SETCC_INVALID; break;
42101         }
42102         if (NewCC != ISD::SETCC_INVALID) {
42103           Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
42104           return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
42105         }
42106       }
42107     }
42108   }
42109 
  // Check if the first operand is all zeros and the Cond type is vXi1.
  // If this is an AVX512 target, we can improve the use of zero masking by
  // swapping the operands and inverting the condition.
42113   if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
42114        Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
42115       ISD::isBuildVectorAllZeros(LHS.getNode()) &&
42116       !ISD::isBuildVectorAllZeros(RHS.getNode())) {
42117     // Invert the cond to not(cond) : xor(op,allones)=not(op)
42118     SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
42119     // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
42120     return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
42121   }
42122 
  // Early exit check: the remaining combines only handle legal value types.
42124   if (!TLI.isTypeLegal(VT))
42125     return SDValue();
42126 
42127   if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
42128     return V;
42129 
42130   if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
42131     return V;
42132 
42133   if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
42134     return V;
42135 
42136   // select(~Cond, X, Y) -> select(Cond, Y, X)
42137   if (CondVT.getScalarType() != MVT::i1) {
42138     if (SDValue CondNot = IsNOT(Cond, DAG))
42139       return DAG.getNode(N->getOpcode(), DL, VT,
42140                          DAG.getBitcast(CondVT, CondNot), RHS, LHS);
42141     // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit.
42142     if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
42143         ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
42144       Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
42145                          DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
42146       return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
42147     }
42148   }
42149 
42150   // Try to optimize vXi1 selects if both operands are either all constants or
42151   // bitcasts from scalar integer type. In that case we can convert the operands
42152   // to integer and use an integer select which will be converted to a CMOV.
42153   // We need to take a little bit of care to avoid creating an i64 type after
42154   // type legalization.
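  // Illustrative example: a v8i1 select of two constant vectors becomes an i8
  // select of the corresponding bitmask constants, which can then lower to a
  // CMOV.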
42155   if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
42156       VT.getVectorElementType() == MVT::i1 &&
42157       (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
42158     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
42159     bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
42160     bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
42161 
42162     if ((LHSIsConst ||
42163          (LHS.getOpcode() == ISD::BITCAST &&
42164           LHS.getOperand(0).getValueType() == IntVT)) &&
42165         (RHSIsConst ||
42166          (RHS.getOpcode() == ISD::BITCAST &&
42167           RHS.getOperand(0).getValueType() == IntVT))) {
42168       if (LHSIsConst)
42169         LHS = combinevXi1ConstantToInteger(LHS, DAG);
42170       else
42171         LHS = LHS.getOperand(0);
42172 
42173       if (RHSIsConst)
42174         RHS = combinevXi1ConstantToInteger(RHS, DAG);
42175       else
42176         RHS = RHS.getOperand(0);
42177 
42178       SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
42179       return DAG.getBitcast(VT, Select);
42180     }
42181   }
42182 
42183   // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
42184   // single bits, then invert the predicate and swap the select operands.
42185   // This can lower using a vector shift bit-hack rather than mask and compare.
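  // Illustrative example: for v4i32 with C = <1, 2, 4, 8>, shifting the lanes
  // left by <31, 30, 29, 28> moves each mask bit into the sign bit, which the
  // inverted select can then consume directly.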
42186   if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
42187       N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
42188       Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
42189       Cond.getOperand(0).getOpcode() == ISD::AND &&
42190       isNullOrNullSplat(Cond.getOperand(1)) &&
42191       cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
42192       Cond.getOperand(0).getValueType() == VT) {
42193     // The 'and' mask must be composed of power-of-2 constants.
42194     SDValue And = Cond.getOperand(0);
42195     auto *C = isConstOrConstSplat(And.getOperand(1));
42196     if (C && C->getAPIntValue().isPowerOf2()) {
42197       // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
42198       SDValue NotCond =
42199           DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
42200       return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
42201     }
42202 
42203     // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
42204     // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
42205     // 16-bit lacks a proper blendv.
42206     unsigned EltBitWidth = VT.getScalarSizeInBits();
42207     bool CanShiftBlend =
42208         TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
42209                                 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
42210                                 (Subtarget.hasXOP()));
42211     if (CanShiftBlend &&
42212         ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
42213           return C->getAPIntValue().isPowerOf2();
42214         })) {
42215       // Create a left-shift constant to get the mask bits over to the sign-bit.
42216       SDValue Mask = And.getOperand(1);
42217       SmallVector<int, 32> ShlVals;
42218       for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
42219         auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
42220         ShlVals.push_back(EltBitWidth - 1 -
42221                           MaskVal->getAPIntValue().exactLogBase2());
42222       }
42223       // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
42224       SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
42225       SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
42226       SDValue NewCond =
42227           DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
42228       return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
42229     }
42230   }
42231 
42232   return SDValue();
42233 }
42234 
42235 /// Combine:
42236 ///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
42237 /// to:
42238 ///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
42239 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
42240 /// Note that this is only legal for some op/cc combinations.
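/// For example (illustrative), "if ((int)atomic_fetch_add(&x, 1) < 0)" can
/// test the flags of "lock addl $1, (x)" with COND_LE, since old < 0 is
/// equivalent to old + 1 <= 0 once the condition codes account for overflow.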
42241 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
42242                                        SelectionDAG &DAG,
42243                                        const X86Subtarget &Subtarget) {
42244   // This combine only operates on CMP-like nodes.
42245   if (!(Cmp.getOpcode() == X86ISD::CMP ||
42246         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
42247     return SDValue();
42248 
42249   // Can't replace the cmp if it has more uses than the one we're looking at.
42250   // FIXME: We would like to be able to handle this, but would need to make sure
42251   // all uses were updated.
42252   if (!Cmp.hasOneUse())
42253     return SDValue();
42254 
42255   // This only applies to variations of the common case:
42256   //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
42257   //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
42258   //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
42259   //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
  // With the proper condition codes (see below), overflow is accounted for.
42261 
42262   // FIXME: We can generalize both constraints:
42263   // - XOR/OR/AND (if they were made to survive AtomicExpand)
42264   // - LHS != 1
42265   // if the result is compared.
42266 
42267   SDValue CmpLHS = Cmp.getOperand(0);
42268   SDValue CmpRHS = Cmp.getOperand(1);
42269   EVT CmpVT = CmpLHS.getValueType();
42270 
42271   if (!CmpLHS.hasOneUse())
42272     return SDValue();
42273 
42274   unsigned Opc = CmpLHS.getOpcode();
42275   if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
42276     return SDValue();
42277 
42278   SDValue OpRHS = CmpLHS.getOperand(2);
42279   auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
42280   if (!OpRHSC)
42281     return SDValue();
42282 
42283   APInt Addend = OpRHSC->getAPIntValue();
42284   if (Opc == ISD::ATOMIC_LOAD_SUB)
42285     Addend = -Addend;
42286 
42287   auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
42288   if (!CmpRHSC)
42289     return SDValue();
42290 
42291   APInt Comparison = CmpRHSC->getAPIntValue();
42292   APInt NegAddend = -Addend;
42293 
42294   // See if we can adjust the CC to make the comparison match the negated
42295   // addend.
42296   if (Comparison != NegAddend) {
42297     APInt IncComparison = Comparison + 1;
42298     if (IncComparison == NegAddend) {
42299       if (CC == X86::COND_A && !Comparison.isMaxValue()) {
42300         Comparison = IncComparison;
42301         CC = X86::COND_AE;
42302       } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
42303         Comparison = IncComparison;
42304         CC = X86::COND_L;
42305       }
42306     }
42307     APInt DecComparison = Comparison - 1;
42308     if (DecComparison == NegAddend) {
42309       if (CC == X86::COND_AE && !Comparison.isMinValue()) {
42310         Comparison = DecComparison;
42311         CC = X86::COND_A;
42312       } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
42313         Comparison = DecComparison;
42314         CC = X86::COND_LE;
42315       }
42316     }
42317   }
42318 
42319   // If the addend is the negation of the comparison value, then we can do
42320   // a full comparison by emitting the atomic arithmetic as a locked sub.
42321   if (Comparison == NegAddend) {
42322     // The CC is fine, but we need to rewrite the LHS of the comparison as an
42323     // atomic sub.
42324     auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
42325     auto AtomicSub = DAG.getAtomic(
42326         ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
42327         /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
42328         /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
42329         AN->getMemOperand());
42330     auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
42331     DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
42332     DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
42333     return LockOp;
42334   }
42335 
42336   // We can handle comparisons with zero in a number of cases by manipulating
42337   // the CC used.
42338   if (!Comparison.isNullValue())
42339     return SDValue();
42340 
42341   if (CC == X86::COND_S && Addend == 1)
42342     CC = X86::COND_LE;
42343   else if (CC == X86::COND_NS && Addend == 1)
42344     CC = X86::COND_G;
42345   else if (CC == X86::COND_G && Addend == -1)
42346     CC = X86::COND_GE;
42347   else if (CC == X86::COND_LE && Addend == -1)
42348     CC = X86::COND_L;
42349   else
42350     return SDValue();
42351 
42352   SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
42353   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
42354   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
42355   return LockOp;
42356 }
42357 
// Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and the proper
// condition code.
42361 //
42362 // Simplify the following patterns:
42363 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
42364 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
42365 // to (Op EFLAGS Cond)
42366 //
42367 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
42368 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
42369 // to (Op EFLAGS !Cond)
42370 //
42371 // where Op could be BRCOND or CMOV.
42372 //
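// For example (illustrative), (BRCOND (CMP (SETCC COND_E EFLAGS) 1) EQ)
// simplifies to (BRCOND EFLAGS COND_E), while the compare-against-0 EQ form
// uses the opposite condition instead.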
42373 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
42374   // This combine only operates on CMP-like nodes.
42375   if (!(Cmp.getOpcode() == X86ISD::CMP ||
42376         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
42377     return SDValue();
42378 
42379   // Quit if not used as a boolean value.
42380   if (CC != X86::COND_E && CC != X86::COND_NE)
42381     return SDValue();
42382 
  // Check CMP operands. One of them should be 0 or 1 and the other should be
  // a SetCC or extended from it.
42385   SDValue Op1 = Cmp.getOperand(0);
42386   SDValue Op2 = Cmp.getOperand(1);
42387 
42388   SDValue SetCC;
42389   const ConstantSDNode* C = nullptr;
42390   bool needOppositeCond = (CC == X86::COND_E);
42391   bool checkAgainstTrue = false; // Is it a comparison against 1?
42392 
42393   if ((C = dyn_cast<ConstantSDNode>(Op1)))
42394     SetCC = Op2;
42395   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
42396     SetCC = Op1;
  else // Quit if neither operand is a constant.
42398     return SDValue();
42399 
42400   if (C->getZExtValue() == 1) {
42401     needOppositeCond = !needOppositeCond;
42402     checkAgainstTrue = true;
42403   } else if (C->getZExtValue() != 0)
    // Quit if the constant is neither 0 nor 1.
42405     return SDValue();
42406 
42407   bool truncatedToBoolWithAnd = false;
42408   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
42409   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
42410          SetCC.getOpcode() == ISD::TRUNCATE ||
42411          SetCC.getOpcode() == ISD::AND) {
42412     if (SetCC.getOpcode() == ISD::AND) {
42413       int OpIdx = -1;
42414       if (isOneConstant(SetCC.getOperand(0)))
42415         OpIdx = 1;
42416       if (isOneConstant(SetCC.getOperand(1)))
42417         OpIdx = 0;
42418       if (OpIdx < 0)
42419         break;
42420       SetCC = SetCC.getOperand(OpIdx);
42421       truncatedToBoolWithAnd = true;
42422     } else
42423       SetCC = SetCC.getOperand(0);
42424   }
42425 
42426   switch (SetCC.getOpcode()) {
42427   case X86ISD::SETCC_CARRY:
42428     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
42429     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
42430     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
42431     // truncated to i1 using 'and'.
42432     if (checkAgainstTrue && !truncatedToBoolWithAnd)
42433       break;
42434     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
42435            "Invalid use of SETCC_CARRY!");
42436     LLVM_FALLTHROUGH;
42437   case X86ISD::SETCC:
42438     // Set the condition code or opposite one if necessary.
42439     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
42440     if (needOppositeCond)
42441       CC = X86::GetOppositeBranchCondition(CC);
42442     return SetCC.getOperand(1);
42443   case X86ISD::CMOV: {
    // Check whether the false/true values are canonical, i.e. 0 or 1.
42445     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
42446     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
42447     // Quit if true value is not a constant.
42448     if (!TVal)
42449       return SDValue();
42450     // Quit if false value is not a constant.
42451     if (!FVal) {
42452       SDValue Op = SetCC.getOperand(0);
42453       // Skip 'zext' or 'trunc' node.
42454       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
42455           Op.getOpcode() == ISD::TRUNCATE)
42456         Op = Op.getOperand(0);
      // A special case for rdrand/rdseed, where the result is 0 when the
      // false condition is found.
42459       if ((Op.getOpcode() != X86ISD::RDRAND &&
42460            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
42461         return SDValue();
42462     }
42463     // Quit if false value is not the constant 0 or 1.
42464     bool FValIsFalse = true;
42465     if (FVal && FVal->getZExtValue() != 0) {
42466       if (FVal->getZExtValue() != 1)
42467         return SDValue();
42468       // If FVal is 1, opposite cond is needed.
42469       needOppositeCond = !needOppositeCond;
42470       FValIsFalse = false;
42471     }
42472     // Quit if TVal is not the constant opposite of FVal.
42473     if (FValIsFalse && TVal->getZExtValue() != 1)
42474       return SDValue();
42475     if (!FValIsFalse && TVal->getZExtValue() != 0)
42476       return SDValue();
42477     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
42478     if (needOppositeCond)
42479       CC = X86::GetOppositeBranchCondition(CC);
42480     return SetCC.getOperand(3);
42481   }
42482   }
42483 
42484   return SDValue();
42485 }
42486 
42487 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
42488 /// Match:
42489 ///   (X86or (X86setcc) (X86setcc))
42490 ///   (X86cmp (and (X86setcc) (X86setcc)), 0)
42491 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
42492                                            X86::CondCode &CC1, SDValue &Flags,
42493                                            bool &isAnd) {
42494   if (Cond->getOpcode() == X86ISD::CMP) {
42495     if (!isNullConstant(Cond->getOperand(1)))
42496       return false;
42497 
42498     Cond = Cond->getOperand(0);
42499   }
42500 
42501   isAnd = false;
42502 
42503   SDValue SetCC0, SetCC1;
42504   switch (Cond->getOpcode()) {
42505   default: return false;
42506   case ISD::AND:
42507   case X86ISD::AND:
42508     isAnd = true;
42509     LLVM_FALLTHROUGH;
42510   case ISD::OR:
42511   case X86ISD::OR:
42512     SetCC0 = Cond->getOperand(0);
42513     SetCC1 = Cond->getOperand(1);
42514     break;
  }
42516 
42517   // Make sure we have SETCC nodes, using the same flags value.
42518   if (SetCC0.getOpcode() != X86ISD::SETCC ||
42519       SetCC1.getOpcode() != X86ISD::SETCC ||
42520       SetCC0->getOperand(1) != SetCC1->getOperand(1))
42521     return false;
42522 
42523   CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
42524   CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
42525   Flags = SetCC0->getOperand(1);
42526   return true;
42527 }
42528 
// When legalizing carry, we create carries via "add X, -1".
// If that comes from an actual carry, via setcc, we use the
// carry directly.
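// e.g. (illustrative) for "add (setcc X86::COND_B, Flags), -1", the add sets
// CF exactly when the setcc produced 1, so Flags can feed the carry user
// directly.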
42532 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
42533   if (EFLAGS.getOpcode() == X86ISD::ADD) {
42534     if (isAllOnesConstant(EFLAGS.getOperand(1))) {
42535       SDValue Carry = EFLAGS.getOperand(0);
42536       while (Carry.getOpcode() == ISD::TRUNCATE ||
42537              Carry.getOpcode() == ISD::ZERO_EXTEND ||
42538              Carry.getOpcode() == ISD::SIGN_EXTEND ||
42539              Carry.getOpcode() == ISD::ANY_EXTEND ||
42540              (Carry.getOpcode() == ISD::AND &&
42541               isOneConstant(Carry.getOperand(1))))
42542         Carry = Carry.getOperand(0);
42543       if (Carry.getOpcode() == X86ISD::SETCC ||
42544           Carry.getOpcode() == X86ISD::SETCC_CARRY) {
42545         // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
42546         uint64_t CarryCC = Carry.getConstantOperandVal(0);
42547         SDValue CarryOp1 = Carry.getOperand(1);
42548         if (CarryCC == X86::COND_B)
42549           return CarryOp1;
42550         if (CarryCC == X86::COND_A) {
42551           // Try to convert COND_A into COND_B in an attempt to facilitate
42552           // materializing "setb reg".
42553           //
          // Do not flip "e > c", where "c" is a constant, because the CMP
          // instruction cannot take an immediate as its first operand.
42556           //
42557           if (CarryOp1.getOpcode() == X86ISD::SUB &&
42558               CarryOp1.getNode()->hasOneUse() &&
42559               CarryOp1.getValueType().isInteger() &&
42560               !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
42561             SDValue SubCommute =
42562                 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
42563                             CarryOp1.getOperand(1), CarryOp1.getOperand(0));
42564             return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
42565           }
42566         }
42567         // If this is a check of the z flag of an add with 1, switch to the
42568         // C flag.
42569         if (CarryCC == X86::COND_E &&
42570             CarryOp1.getOpcode() == X86ISD::ADD &&
42571             isOneConstant(CarryOp1.getOperand(1)))
42572           return CarryOp1;
42573       }
42574     }
42575   }
42576 
42577   return SDValue();
42578 }
42579 
/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
42581 /// to avoid the inversion.
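/// For example (illustrative), "setae (ptest (not X), Y)" can instead be
/// emitted as "setne (ptest X, Y)", folding the NOT away by remapping the
/// condition code.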
42582 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
42583                               SelectionDAG &DAG,
42584                               const X86Subtarget &Subtarget) {
42585   // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
42586   if (EFLAGS.getOpcode() != X86ISD::PTEST &&
42587       EFLAGS.getOpcode() != X86ISD::TESTP)
42588     return SDValue();
42589 
42590   // PTEST/TESTP sets EFLAGS as:
42591   // TESTZ: ZF = (Op0 & Op1) == 0
42592   // TESTC: CF = (~Op0 & Op1) == 0
42593   // TESTNZC: ZF == 0 && CF == 0
42594   EVT VT = EFLAGS.getValueType();
42595   SDValue Op0 = EFLAGS.getOperand(0);
42596   SDValue Op1 = EFLAGS.getOperand(1);
42597   EVT OpVT = Op0.getValueType();
42598 
42599   // TEST*(~X,Y) == TEST*(X,Y)
42600   if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
42601     X86::CondCode InvCC;
42602     switch (CC) {
42603     case X86::COND_B:
42604       // testc -> testz.
42605       InvCC = X86::COND_E;
42606       break;
42607     case X86::COND_AE:
42608       // !testc -> !testz.
42609       InvCC = X86::COND_NE;
42610       break;
42611     case X86::COND_E:
42612       // testz -> testc.
42613       InvCC = X86::COND_B;
42614       break;
42615     case X86::COND_NE:
42616       // !testz -> !testc.
42617       InvCC = X86::COND_AE;
42618       break;
42619     case X86::COND_A:
42620     case X86::COND_BE:
42621       // testnzc -> testnzc (no change).
42622       InvCC = CC;
42623       break;
42624     default:
42625       InvCC = X86::COND_INVALID;
42626       break;
42627     }
42628 
42629     if (InvCC != X86::COND_INVALID) {
42630       CC = InvCC;
42631       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42632                          DAG.getBitcast(OpVT, NotOp0), Op1);
42633     }
42634   }
42635 
42636   if (CC == X86::COND_E || CC == X86::COND_NE) {
42637     // TESTZ(X,~Y) == TESTC(Y,X)
42638     if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
42639       CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
42640       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42641                          DAG.getBitcast(OpVT, NotOp1), Op0);
42642     }
42643 
42644     if (Op0 == Op1) {
42645       SDValue BC = peekThroughBitcasts(Op0);
42646       EVT BCVT = BC.getValueType();
42647       assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
42648              "Unexpected vector type");
42649 
42650       // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
42651       if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
42652         return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42653                            DAG.getBitcast(OpVT, BC.getOperand(0)),
42654                            DAG.getBitcast(OpVT, BC.getOperand(1)));
42655       }
42656 
42657       // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
42658       if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
42659         CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
42660         return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42661                            DAG.getBitcast(OpVT, BC.getOperand(0)),
42662                            DAG.getBitcast(OpVT, BC.getOperand(1)));
42663       }
42664 
42665       // If every element is an all-sign value, see if we can use MOVMSK to
42666       // more efficiently extract the sign bits and compare that.
42667       // TODO: Handle TESTC with comparison inversion.
42668       // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
      // MOVMSK combines to make sure it's never worse than PTEST?
42670       unsigned EltBits = BCVT.getScalarSizeInBits();
42671       if (DAG.ComputeNumSignBits(BC) == EltBits) {
42672         assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
42673         APInt SignMask = APInt::getSignMask(EltBits);
42674         const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42675         if (SDValue Res =
42676                 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
          // For vXi16 cases we need to use pmovmskb and extract every other
          // sign bit.
42679           SDLoc DL(EFLAGS);
42680           if (EltBits == 16) {
42681             MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
42682             Res = DAG.getBitcast(MovmskVT, Res);
42683             Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
42684             Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
42685                               DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
42686           } else {
42687             Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
42688           }
42689           return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
42690                              DAG.getConstant(0, DL, MVT::i32));
42691         }
42692       }
42693     }
42694 
42695     // TESTZ(-1,X) == TESTZ(X,X)
42696     if (ISD::isBuildVectorAllOnes(Op0.getNode()))
42697       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
42698 
42699     // TESTZ(X,-1) == TESTZ(X,X)
42700     if (ISD::isBuildVectorAllOnes(Op1.getNode()))
42701       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
42702   }
42703 
42704   return SDValue();
42705 }
42706 
42707 // Attempt to simplify the MOVMSK input based on the comparison type.
42708 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
42709                                   SelectionDAG &DAG,
42710                                   const X86Subtarget &Subtarget) {
42711   // Handle eq/ne against zero (any_of).
42712   // Handle eq/ne against -1 (all_of).
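  // e.g. (illustrative) "(movmsk V) != 0" asks whether any lane's sign bit is
  // set (any_of), while "(movmsk V) == (1 << NumElts) - 1" asks whether every
  // lane's sign bit is set (all_of).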
42713   if (!(CC == X86::COND_E || CC == X86::COND_NE))
42714     return SDValue();
42715   if (EFLAGS.getValueType() != MVT::i32)
42716     return SDValue();
42717   unsigned CmpOpcode = EFLAGS.getOpcode();
42718   if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
42719     return SDValue();
42720   auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
42721   if (!CmpConstant)
42722     return SDValue();
42723   const APInt &CmpVal = CmpConstant->getAPIntValue();
42724 
42725   SDValue CmpOp = EFLAGS.getOperand(0);
42726   unsigned CmpBits = CmpOp.getValueSizeInBits();
42727   assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
42728 
42729   // Peek through any truncate.
42730   if (CmpOp.getOpcode() == ISD::TRUNCATE)
42731     CmpOp = CmpOp.getOperand(0);
42732 
42733   // Bail if we don't find a MOVMSK.
42734   if (CmpOp.getOpcode() != X86ISD::MOVMSK)
42735     return SDValue();
42736 
42737   SDValue Vec = CmpOp.getOperand(0);
42738   MVT VecVT = Vec.getSimpleValueType();
42739   assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
42740          "Unexpected MOVMSK operand");
42741   unsigned NumElts = VecVT.getVectorNumElements();
42742   unsigned NumEltBits = VecVT.getScalarSizeInBits();
42743 
42744   bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
42745   bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
42746                  CmpVal.isMask(NumElts);
42747   if (!IsAnyOf && !IsAllOf)
42748     return SDValue();
42749 
42750   // See if we can peek through to a vector with a wider element type, if the
42751   // signbits extend down to all the sub-elements as well.
42752   // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
42753   // potential SimplifyDemandedBits/Elts cases.
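  // e.g. (illustrative) "movmsk(v16i8 (bitcast (v4i32 X))) == 0" can become
  // "movmsk(v4i32 X) == 0" when every i32 lane of X is known to be all-zeros
  // or all-ones, so the sign bits cover the narrower lanes.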
42754   if (Vec.getOpcode() == ISD::BITCAST) {
42755     SDValue BC = peekThroughBitcasts(Vec);
42756     MVT BCVT = BC.getSimpleValueType();
42757     unsigned BCNumElts = BCVT.getVectorNumElements();
42758     unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
42759     if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
42760         BCNumEltBits > NumEltBits &&
42761         DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
42762       SDLoc DL(EFLAGS);
42763       unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
42764       return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
42765                          DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
42766                          DAG.getConstant(CmpMask, DL, MVT::i32));
42767     }
42768   }
42769 
42770   // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
42771   // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
42772   if (IsAllOf && Subtarget.hasSSE41()) {
42773     SDValue BC = peekThroughBitcasts(Vec);
42774     if (BC.getOpcode() == X86ISD::PCMPEQ &&
42775         ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
42776       MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
42777       SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
42778       return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
42779     }
42780   }
42781 
42782   // See if we can avoid a PACKSS by calling MOVMSK on the sources.
42783   // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
42784   // sign bits prior to the comparison with zero unless we know that
42785   // the vXi16 splats the sign bit down to the lower i8 half.
42786   // TODO: Handle all_of patterns.
42787   if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
42788     SDValue VecOp0 = Vec.getOperand(0);
42789     SDValue VecOp1 = Vec.getOperand(1);
42790     bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
42791     bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
42792     // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
42793     if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
42794       SDLoc DL(EFLAGS);
42795       SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
42796       Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42797       Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
42798       if (!SignExt0) {
42799         Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
42800                              DAG.getConstant(0xAAAA, DL, MVT::i16));
42801       }
42802       return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42803                          DAG.getConstant(0, DL, MVT::i16));
42804     }
42805     // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
42806     // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
42807     if (CmpBits >= 16 && Subtarget.hasInt256() &&
42808         VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42809         VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42810         VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
42811         VecOp0.getConstantOperandAPInt(1) == 0 &&
42812         VecOp1.getConstantOperandAPInt(1) == 8 &&
42813         (IsAnyOf || (SignExt0 && SignExt1))) {
42814       SDLoc DL(EFLAGS);
42815       SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
42816       Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42817       unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
42818       if (!SignExt0 || !SignExt1) {
42819         assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
42820         Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
42821                              DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
42822       }
42823       return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42824                          DAG.getConstant(CmpMask, DL, MVT::i32));
42825     }
42826   }
42827 
42828   // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
42829   SmallVector<int, 32> ShuffleMask;
42830   SmallVector<SDValue, 2> ShuffleInputs;
42831   if (NumElts <= CmpBits &&
42832       getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
42833                              ShuffleMask, DAG) &&
42834       ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
42835       ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
42836     unsigned NumShuffleElts = ShuffleMask.size();
42837     APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
42838     for (int M : ShuffleMask) {
42839       assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
42840       DemandedElts.setBit(M);
42841     }
42842     if (DemandedElts.isAllOnesValue()) {
42843       SDLoc DL(EFLAGS);
42844       SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
42845       Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42846       Result =
42847           DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
42848       return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42849                          EFLAGS.getOperand(1));
42850     }
42851   }
42852 
42853   return SDValue();
42854 }
42855 
42856 /// Optimize an EFLAGS definition used according to the condition code \p CC
42857 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
42858 /// uses of chain values.
42859 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
42860                                   SelectionDAG &DAG,
42861                                   const X86Subtarget &Subtarget) {
42862   if (CC == X86::COND_B)
42863     if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
42864       return Flags;
42865 
42866   if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
42867     return R;
42868 
42869   if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
42870     return R;
42871 
42872   if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
42873     return R;
42874 
42875   return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
42876 }
42877 
42878 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
42879 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
42880                            TargetLowering::DAGCombinerInfo &DCI,
42881                            const X86Subtarget &Subtarget) {
42882   SDLoc DL(N);
42883 
42884   SDValue FalseOp = N->getOperand(0);
42885   SDValue TrueOp = N->getOperand(1);
42886   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
42887   SDValue Cond = N->getOperand(3);
42888 
42889   // cmov X, X, ?, ? --> X
42890   if (TrueOp == FalseOp)
42891     return TrueOp;
42892 
42893   // Try to simplify the EFLAGS and condition code operands.
  // We can't always do this as FCMOV only supports a subset of the X86
  // condition codes.
42895   if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
42896     if (!(FalseOp.getValueType() == MVT::f80 ||
42897           (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
42898           (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
42899         !Subtarget.hasCMov() || hasFPCMov(CC)) {
42900       SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
42901                        Flags};
42902       return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
42903     }
42904   }
42905 
42906   // If this is a select between two integer constants, try to do some
42907   // optimizations.  Note that the operands are ordered the opposite of SELECT
42908   // operands.
42909   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
42910     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
42911       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
42912       // larger than FalseC (the false value).
42913       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
42914         CC = X86::GetOppositeBranchCondition(CC);
42915         std::swap(TrueC, FalseC);
42916         std::swap(TrueOp, FalseOp);
42917       }
42918 
42919       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
42920       // This is efficient for any integer data type (including i8/i16) and
42921       // shift amount.
42922       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
42923         Cond = getSETCC(CC, Cond, DL, DAG);
42924 
42925         // Zero extend the condition if needed.
42926         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
42927 
42928         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
42929         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
42930                            DAG.getConstant(ShAmt, DL, MVT::i8));
42931         return Cond;
42932       }
42933 
      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
42935       // for any integer data type, including i8/i16.
42936       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
42937         Cond = getSETCC(CC, Cond, DL, DAG);
42938 
42939         // Zero extend the condition if needed.
42940         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
42941                            FalseC->getValueType(0), Cond);
42942         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
42943                            SDValue(FalseC, 0));
42944         return Cond;
42945       }
42946 
42947       // Optimize cases that will turn into an LEA instruction.  This requires
42948       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
42949       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
42950         APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
42951         assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
42952                "Implicit constant truncation");
42953 
42954         bool isFastMultiplier = false;
42955         if (Diff.ult(10)) {
42956           switch (Diff.getZExtValue()) {
42957           default: break;
42958           case 1:  // result = add base, cond
42959           case 2:  // result = lea base(    , cond*2)
42960           case 3:  // result = lea base(cond, cond*2)
42961           case 4:  // result = lea base(    , cond*4)
42962           case 5:  // result = lea base(cond, cond*4)
42963           case 8:  // result = lea base(    , cond*8)
42964           case 9:  // result = lea base(cond, cond*8)
42965             isFastMultiplier = true;
42966             break;
42967           }
42968         }
42969 
42970         if (isFastMultiplier) {
          Cond = getSETCC(CC, Cond, DL, DAG);
42972           // Zero extend the condition if needed.
42973           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
42974                              Cond);
42975           // Scale the condition by the difference.
42976           if (Diff != 1)
42977             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
42978                                DAG.getConstant(Diff, DL, Cond.getValueType()));
42979 
42980           // Add the base if non-zero.
42981           if (FalseC->getAPIntValue() != 0)
42982             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
42983                                SDValue(FalseC, 0));
42984           return Cond;
42985         }
42986       }
42987     }
42988   }
42989 
42990   // Handle these cases:
  //   (select (x != c), e, c) -> (select (x != c), e, x),
  //   (select (x == c), c, e) -> (select (x == c), x, e)
  // where c is an integer constant, and the "select" is the combination
42994   // of CMOV and CMP.
42995   //
42996   // The rationale for this change is that the conditional-move from a constant
42997   // needs two instructions, however, conditional-move from a register needs
42998   // only one instruction.
42999   //
  // CAVEAT: Replacing a constant with a symbolic value may obscure some
  //  instruction-combining opportunities, so this optimization needs to be
  //  postponed as late as possible.
43003   //
43004   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
    // The DCI.xxxx conditions are provided to postpone the optimization as
    // late as possible.
43007 
43008     ConstantSDNode *CmpAgainst = nullptr;
43009     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
43010         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
43011         !isa<ConstantSDNode>(Cond.getOperand(0))) {
43012 
43013       if (CC == X86::COND_NE &&
43014           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
43015         CC = X86::GetOppositeBranchCondition(CC);
43016         std::swap(TrueOp, FalseOp);
43017       }
43018 
43019       if (CC == X86::COND_E &&
43020           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
43021         SDValue Ops[] = {FalseOp, Cond.getOperand(0),
43022                          DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
43023         return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
43024       }
43025     }
43026   }
43027 
43028   // Fold and/or of setcc's to double CMOV:
43029   //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
43030   //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
43031   //
43032   // This combine lets us generate:
43033   //   cmovcc1 (jcc1 if we don't have CMOV)
43034   //   cmovcc2 (same)
43035   // instead of:
43036   //   setcc1
43037   //   setcc2
43038   //   and/or
43039   //   cmovne (jne if we don't have CMOV)
43040   // When we can't use the CMOV instruction, it might increase branch
43041   // mispredicts.
43042   // When we can use CMOV, or when there is no mispredict, this improves
43043   // throughput and reduces register pressure.
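  //
  // For example, with cc1 == COND_E and cc2 == COND_B, the AND form becomes:
  //   (CMOV F, T, ((E & B) != 0)) -> (CMOV (CMOV T, F, NE), F, AE)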
43044   //
43045   if (CC == X86::COND_NE) {
43046     SDValue Flags;
43047     X86::CondCode CC0, CC1;
43048     bool isAndSetCC;
43049     if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
43050       if (isAndSetCC) {
43051         std::swap(FalseOp, TrueOp);
43052         CC0 = X86::GetOppositeBranchCondition(CC0);
43053         CC1 = X86::GetOppositeBranchCondition(CC1);
43054       }
43055 
43056       SDValue LOps[] = {FalseOp, TrueOp,
43057                         DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
43058       SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
43059       SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
43060                        Flags};
43061       SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
43062       return CMOV;
43063     }
43064   }
43065 
43066   // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
43067   //      (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
43068   // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
43069   //    (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
43070   if ((CC == X86::COND_NE || CC == X86::COND_E) &&
43071       Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
43072     SDValue Add = TrueOp;
43073     SDValue Const = FalseOp;
43074     // Canonicalize the condition code for easier matching and output.
43075     if (CC == X86::COND_E)
43076       std::swap(Add, Const);
43077 
    // We might have replaced the constant in the cmov with the LHS of the
    // compare. If so, change it to the RHS of the compare.
43080     if (Const == Cond.getOperand(0))
43081       Const = Cond.getOperand(1);
43082 
43083     // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
43084     if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
43085         Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
43086         (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
43087          Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
43088         Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
43089       EVT VT = N->getValueType(0);
43090       // This should constant fold.
43091       SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
43092       SDValue CMov =
43093           DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
43094                       DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
43095       return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
43096     }
43097   }
43098 
43099   return SDValue();
43100 }
43101 
43102 /// Different mul shrinking modes.
43103 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
43104 
43105 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
43106   EVT VT = N->getOperand(0).getValueType();
43107   if (VT.getScalarSizeInBits() != 32)
43108     return false;
43109 
43110   assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
43111   unsigned SignBits[2] = {1, 1};
43112   bool IsPositive[2] = {false, false};
43113   for (unsigned i = 0; i < 2; i++) {
43114     SDValue Opd = N->getOperand(i);
43115 
43116     SignBits[i] = DAG.ComputeNumSignBits(Opd);
43117     IsPositive[i] = DAG.SignBitIsZero(Opd);
43118   }
43119 
43120   bool AllPositive = IsPositive[0] && IsPositive[1];
43121   unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
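  // The thresholds below are for the 32-bit elements checked above: a value
  // that fits in a signed i8 (e.g. 0xFFFFFF80 == -128) has at least
  // 32 - 8 + 1 == 25 sign bits, and an unsigned i8 value (e.g. 0x000000FF)
  // has at least 24.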
43122   // When ranges are from -128 ~ 127, use MULS8 mode.
43123   if (MinSignBits >= 25)
43124     Mode = ShrinkMode::MULS8;
43125   // When ranges are from 0 ~ 255, use MULU8 mode.
43126   else if (AllPositive && MinSignBits >= 24)
43127     Mode = ShrinkMode::MULU8;
43128   // When ranges are from -32768 ~ 32767, use MULS16 mode.
43129   else if (MinSignBits >= 17)
43130     Mode = ShrinkMode::MULS16;
43131   // When ranges are from 0 ~ 65535, use MULU16 mode.
43132   else if (AllPositive && MinSignBits >= 16)
43133     Mode = ShrinkMode::MULU16;
43134   else
43135     return false;
43136   return true;
43137 }
43138 
43139 /// When the operands of vector mul are extended from smaller size values,
/// like i8 and i16, the type of mul may be shrunk to generate more
43141 /// efficient code. Two typical patterns are handled:
43142 /// Pattern1:
43143 ///     %2 = sext/zext <N x i8> %1 to <N x i32>
43144 ///     %4 = sext/zext <N x i8> %3 to <N x i32>
///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
43146 ///     %5 = mul <N x i32> %2, %4
43147 ///
43148 /// Pattern2:
43149 ///     %2 = zext/sext <N x i16> %1 to <N x i32>
43150 ///     %4 = zext/sext <N x i16> %3 to <N x i32>
43151 ///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
43152 ///     %5 = mul <N x i32> %2, %4
43153 ///
43154 /// There are four mul shrinking modes:
43155 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
43157 /// generate pmullw+sext32 for it (MULS8 mode).
43158 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
43159 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
43160 /// generate pmullw+zext32 for it (MULU8 mode).
43161 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
43162 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
43163 /// generate pmullw+pmulhw for it (MULS16 mode).
43164 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
43165 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
43166 /// generate pmullw+pmulhuw for it (MULU16 mode).
43167 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
43168                                const X86Subtarget &Subtarget) {
43169   // Check for legality
  // pmullw/pmulhw are not available without SSE2.
43171   if (!Subtarget.hasSSE2())
43172     return SDValue();
43173 
43174   // Check for profitability
43175   // pmulld is supported since SSE41. It is better to use pmulld
43176   // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
43177   // the expansion.
43178   bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
43179   if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
43180     return SDValue();
43181 
43182   ShrinkMode Mode;
43183   if (!canReduceVMulWidth(N, DAG, Mode))
43184     return SDValue();
43185 
43186   SDLoc DL(N);
43187   SDValue N0 = N->getOperand(0);
43188   SDValue N1 = N->getOperand(1);
43189   EVT VT = N->getOperand(0).getValueType();
43190   unsigned NumElts = VT.getVectorNumElements();
43191   if ((NumElts % 2) != 0)
43192     return SDValue();
43193 
43194   EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
43195 
43196   // Shrink the operands of mul.
43197   SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
43198   SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
43199 
43200   // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
43201   // lower part is needed.
43202   SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
43203   if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
43204     return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
43205                                                    : ISD::SIGN_EXTEND,
43206                        DL, VT, MulLo);
43207 
43208   EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
43209   // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
43210   // the higher part is also needed.
43211   SDValue MulHi =
43212       DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
43213                   ReducedVT, NewN0, NewN1);
43214 
  // Repack the low-half and high-half results of the multiply into a wider
  // result.
43217   // Generate shuffle functioning as punpcklwd.
43218   SmallVector<int, 16> ShuffleMask(NumElts);
43219   for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
43220     ShuffleMask[2 * i] = i;
43221     ShuffleMask[2 * i + 1] = i + NumElts;
43222   }
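  // For example, with NumElts == 8 this builds the interleave mask
  // {0, 8, 1, 9, 2, 10, 3, 11}, pairing each low-half product in MulLo with
  // its high-half counterpart in MulHi, just as punpcklwd would.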
43223   SDValue ResLo =
43224       DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
43225   ResLo = DAG.getBitcast(ResVT, ResLo);
43226   // Generate shuffle functioning as punpckhwd.
43227   for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
43228     ShuffleMask[2 * i] = i + NumElts / 2;
43229     ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
43230   }
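  // For example, with NumElts == 8 this builds {4, 12, 5, 13, 6, 14, 7, 15},
  // interleaving the upper halves of MulLo and MulHi as punpckhwd would.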
43231   SDValue ResHi =
43232       DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
43233   ResHi = DAG.getBitcast(ResVT, ResHi);
43234   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
43235 }
43236 
43237 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
43238                                  EVT VT, const SDLoc &DL) {
43239 
43240   auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
43241     SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43242                                  DAG.getConstant(Mult, DL, VT));
43243     Result = DAG.getNode(ISD::SHL, DL, VT, Result,
43244                          DAG.getConstant(Shift, DL, MVT::i8));
43245     Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
43246                          N->getOperand(0));
43247     return Result;
43248   };
43249 
43250   auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
43251     SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43252                                  DAG.getConstant(Mul1, DL, VT));
43253     Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
43254                          DAG.getConstant(Mul2, DL, VT));
43255     Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
43256                          N->getOperand(0));
43257     return Result;
43258   };
43259 
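  // Each case below factors MulAmt so that the inner multiplies map onto LEA
  // scales (3, 5, or 9); e.g. 23 == (x*3)*8 - x and 29 == (x*9)*3 + x + x.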
43260   switch (MulAmt) {
43261   default:
43262     break;
43263   case 11:
43264     // mul x, 11 => add ((shl (mul x, 5), 1), x)
43265     return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
43266   case 21:
43267     // mul x, 21 => add ((shl (mul x, 5), 2), x)
43268     return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
43269   case 41:
43270     // mul x, 41 => add ((shl (mul x, 5), 3), x)
43271     return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
43272   case 22:
43273     // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
43274     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
43275                        combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
43276   case 19:
43277     // mul x, 19 => add ((shl (mul x, 9), 1), x)
43278     return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
43279   case 37:
43280     // mul x, 37 => add ((shl (mul x, 9), 2), x)
43281     return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
43282   case 73:
43283     // mul x, 73 => add ((shl (mul x, 9), 3), x)
43284     return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
43285   case 13:
43286     // mul x, 13 => add ((shl (mul x, 3), 2), x)
43287     return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
43288   case 23:
43289     // mul x, 23 => sub ((shl (mul x, 3), 3), x)
43290     return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
43291   case 26:
43292     // mul x, 26 => add ((mul (mul x, 5), 5), x)
43293     return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
43294   case 28:
43295     // mul x, 28 => add ((mul (mul x, 9), 3), x)
43296     return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
43297   case 29:
43298     // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
43299     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
43300                        combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
43301   }
43302 
  // Another trick. If this is a power of 2 plus 2, 4 or 8, we can use a shift
  // followed by a single LEA.
  // First check that this is a sum of two powers of 2 because that's easy.
  // Then count the trailing zeros to find the smaller power (the LEA scale).
43307   // TODO: We can do this even without LEA at a cost of two shifts and an add.
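  // For example, MulAmt == 20 (16 + 4) gives ScaleShift == 2 and
  // ShiftAmt == 4, producing (x << 4) + (x << 2); the second shift can be
  // folded into an LEA scale.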
43308   if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
43309     unsigned ScaleShift = countTrailingZeros(MulAmt);
43310     if (ScaleShift >= 1 && ScaleShift < 4) {
43311       unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
43312       SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43313                                    DAG.getConstant(ShiftAmt, DL, MVT::i8));
43314       SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43315                                    DAG.getConstant(ScaleShift, DL, MVT::i8));
43316       return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
43317     }
43318   }
43319 
43320   return SDValue();
43321 }
43322 
43323 // If the upper 17 bits of each element are zero then we can use PMADDWD,
43324 // which is always at least as quick as PMULLD, except on KNL.
43325 static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
43326                                    const X86Subtarget &Subtarget) {
43327   if (!Subtarget.hasSSE2())
43328     return SDValue();
43329 
43330   if (Subtarget.isPMADDWDSlow())
43331     return SDValue();
43332 
43333   EVT VT = N->getValueType(0);
43334 
43335   // Only support vXi32 vectors.
43336   if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
43337     return SDValue();
43338 
43339   // Make sure the type is legal or will be widened to a legal type.
43340   if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
43341     return SDValue();
43342 
43343   MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
43344 
43345   // Without BWI, we would need to split v32i16.
43346   if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
43347     return SDValue();
43348 
43349   SDValue N0 = N->getOperand(0);
43350   SDValue N1 = N->getOperand(1);
43351 
  // If we are zero extending two steps without SSE4.1, it's better to reduce
43353   // the vmul width instead.
43354   if (!Subtarget.hasSSE41() &&
43355       (N0.getOpcode() == ISD::ZERO_EXTEND &&
43356        N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
43357       (N1.getOpcode() == ISD::ZERO_EXTEND &&
43358        N1.getOperand(0).getScalarValueSizeInBits() <= 8))
43359     return SDValue();
43360 
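  // PMADDWD multiplies the sign-extended i16 lanes and adds adjacent pairs.
  // With the upper 17 bits of each i32 element known to be zero, every value
  // fits in 15 bits: the low i16 lane is a non-negative i16, the high i16
  // lane is zero, and the pairwise add reproduces the exact 32-bit product.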
43361   APInt Mask17 = APInt::getHighBitsSet(32, 17);
43362   if (!DAG.MaskedValueIsZero(N1, Mask17) ||
43363       !DAG.MaskedValueIsZero(N0, Mask17))
43364     return SDValue();
43365 
43366   // Use SplitOpsAndApply to handle AVX splitting.
43367   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43368                            ArrayRef<SDValue> Ops) {
43369     MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
43370     return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
43371   };
43372   return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
43373                           { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
43374                           PMADDWDBuilder);
43375 }
43376 
43377 static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
43378                                   const X86Subtarget &Subtarget) {
43379   if (!Subtarget.hasSSE2())
43380     return SDValue();
43381 
43382   EVT VT = N->getValueType(0);
43383 
43384   // Only support vXi64 vectors.
43385   if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
43386       VT.getVectorNumElements() < 2 ||
43387       !isPowerOf2_32(VT.getVectorNumElements()))
43388     return SDValue();
43389 
43390   SDValue N0 = N->getOperand(0);
43391   SDValue N1 = N->getOperand(1);
43392 
  // PMULDQ returns the 64-bit result of the signed multiplication of the lower
  // 32 bits. We can lower with this if the sign bits stretch that far.
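  // For example, an i64 value with more than 32 sign bits is just the sign
  // extension of its low 32 bits, so the sign-extended 32x32->64 multiply
  // reproduces the full 64-bit product.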
43395   if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
43396       DAG.ComputeNumSignBits(N1) > 32) {
43397     auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43398                             ArrayRef<SDValue> Ops) {
43399       return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
43400     };
43401     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
43402                             PMULDQBuilder, /*CheckBWI*/false);
43403   }
43404 
43405   // If the upper bits are zero we can use a single pmuludq.
43406   APInt Mask = APInt::getHighBitsSet(64, 32);
43407   if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
43408     auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43409                              ArrayRef<SDValue> Ops) {
43410       return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
43411     };
43412     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
43413                             PMULUDQBuilder, /*CheckBWI*/false);
43414   }
43415 
43416   return SDValue();
43417 }
43418 
43419 /// Optimize a single multiply with constant into two operations in order to
43420 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
43421 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
43422                           TargetLowering::DAGCombinerInfo &DCI,
43423                           const X86Subtarget &Subtarget) {
43424   EVT VT = N->getValueType(0);
43425 
43426   if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
43427     return V;
43428 
43429   if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
43430     return V;
43431 
43432   if (DCI.isBeforeLegalize() && VT.isVector())
43433     return reduceVMULWidth(N, DAG, Subtarget);
43434 
43435   if (!MulConstantOptimization)
43436     return SDValue();
43437   // An imul is usually smaller than the alternative sequence.
43438   if (DAG.getMachineFunction().getFunction().hasMinSize())
43439     return SDValue();
43440 
43441   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
43442     return SDValue();
43443 
43444   if (VT != MVT::i64 && VT != MVT::i32)
43445     return SDValue();
43446 
43447   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
43448   if (!C)
43449     return SDValue();
43450   if (isPowerOf2_64(C->getZExtValue()))
43451     return SDValue();
43452 
43453   int64_t SignMulAmt = C->getSExtValue();
43454   assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
43455   uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
43456 
43457   SDLoc DL(N);
43458   if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
43459     SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43460                                  DAG.getConstant(AbsMulAmt, DL, VT));
43461     if (SignMulAmt < 0)
43462       NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
43463                            NewMul);
43464 
43465     return NewMul;
43466   }
43467 
43468   uint64_t MulAmt1 = 0;
43469   uint64_t MulAmt2 = 0;
43470   if ((AbsMulAmt % 9) == 0) {
43471     MulAmt1 = 9;
43472     MulAmt2 = AbsMulAmt / 9;
43473   } else if ((AbsMulAmt % 5) == 0) {
43474     MulAmt1 = 5;
43475     MulAmt2 = AbsMulAmt / 5;
43476   } else if ((AbsMulAmt % 3) == 0) {
43477     MulAmt1 = 3;
43478     MulAmt2 = AbsMulAmt / 3;
43479   }
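  // For example, AbsMulAmt == 45 factors as 9 * 5 (two LEAs), while
  // AbsMulAmt == 48 factors as 3 * 16 (an LEA and a shift).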
43480 
43481   SDValue NewMul;
43482   // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
43483   if (MulAmt2 &&
43484       (isPowerOf2_64(MulAmt2) ||
43485        (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
43486 
43487     if (isPowerOf2_64(MulAmt2) &&
43488         !(SignMulAmt >= 0 && N->hasOneUse() &&
43489           N->use_begin()->getOpcode() == ISD::ADD))
      // If the second multiplier is a power of 2, issue it first. We want the
      // multiply by 3, 5, or 9 to be folded into the addressing mode unless
      // the lone use is an add. Only do this for positive multiply amounts
      // since the negate would prevent it from being used as an address mode
      // anyway.
43494       std::swap(MulAmt1, MulAmt2);
43495 
43496     if (isPowerOf2_64(MulAmt1))
43497       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43498                            DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
43499     else
43500       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43501                            DAG.getConstant(MulAmt1, DL, VT));
43502 
43503     if (isPowerOf2_64(MulAmt2))
43504       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
43505                            DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
43506     else
43507       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
43508                            DAG.getConstant(MulAmt2, DL, VT));
43509 
43510     // Negate the result.
43511     if (SignMulAmt < 0)
43512       NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
43513                            NewMul);
43514   } else if (!Subtarget.slowLEA())
43515     NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
43516 
43517   if (!NewMul) {
43518     assert(C->getZExtValue() != 0 &&
43519            C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
43520            "Both cases that could cause potential overflows should have "
43521            "already been handled.");
43522     if (isPowerOf2_64(AbsMulAmt - 1)) {
43523       // (mul x, 2^N + 1) => (add (shl x, N), x)
43524       NewMul = DAG.getNode(
43525           ISD::ADD, DL, VT, N->getOperand(0),
43526           DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43527                       DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
43528                                       MVT::i8)));
      // To negate, subtract the number from zero.
43530       if (SignMulAmt < 0)
43531         NewMul = DAG.getNode(ISD::SUB, DL, VT,
43532                              DAG.getConstant(0, DL, VT), NewMul);
43533     } else if (isPowerOf2_64(AbsMulAmt + 1)) {
43534       // (mul x, 2^N - 1) => (sub (shl x, N), x)
43535       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43536                            DAG.getConstant(Log2_64(AbsMulAmt + 1),
43537                                            DL, MVT::i8));
43538       // To negate, reverse the operands of the subtract.
43539       if (SignMulAmt < 0)
43540         NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
43541       else
43542         NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43543     } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
43544       // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
43545       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43546                            DAG.getConstant(Log2_64(AbsMulAmt - 2),
43547                                            DL, MVT::i8));
43548       NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
43549       NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
43550     } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
43551       // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
43552       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43553                            DAG.getConstant(Log2_64(AbsMulAmt + 2),
43554                                            DL, MVT::i8));
43555       NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43556       NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43557     }
43558   }
43559 
43560   return NewMul;
43561 }
43562 
43563 // Try to form a MULHU or MULHS node by looking for
43564 // (srl (mul ext, ext), 16)
43565 // TODO: This is X86 specific because we want to be able to handle wide types
43566 // before type legalization. But we can only do it if the vector will be
43567 // legalized via widening/splitting. Type legalization can't handle promotion
43568 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
43569 // combiner.
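// For example, (srl (mul (zext vXi16 X), (zext vXi16 Y)), 16) becomes
// (zext (mulhu X, Y)); with sign-extended operands and an SRA the result is
// (sext (mulhs X, Y)) instead.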
43570 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
43571                                    const X86Subtarget &Subtarget) {
43572   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
43573            "SRL or SRA node is required here!");
43574   SDLoc DL(N);
43575 
43576   // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
43577   // the multiply.
43578   if (!Subtarget.hasSSE41())
43579     return SDValue();
43580 
43581   // The operation feeding into the shift must be a multiply.
43582   SDValue ShiftOperand = N->getOperand(0);
43583   if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
43584     return SDValue();
43585 
43586   // Input type should be at least vXi32.
43587   EVT VT = N->getValueType(0);
43588   if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
43589     return SDValue();
43590 
43591   // Need a shift by 16.
43592   APInt ShiftAmt;
43593   if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
43594       ShiftAmt != 16)
43595     return SDValue();
43596 
43597   SDValue LHS = ShiftOperand.getOperand(0);
43598   SDValue RHS = ShiftOperand.getOperand(1);
43599 
43600   unsigned ExtOpc = LHS.getOpcode();
43601   if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
43602       RHS.getOpcode() != ExtOpc)
43603     return SDValue();
43604 
43605   // Peek through the extends.
43606   LHS = LHS.getOperand(0);
43607   RHS = RHS.getOperand(0);
43608 
43609   // Ensure the input types match.
43610   EVT MulVT = LHS.getValueType();
43611   if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
43612     return SDValue();
43613 
43614   unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
43615   SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
43616 
43617   ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
43618   return DAG.getNode(ExtOpc, DL, VT, Mulh);
43619 }
43620 
43621 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
43622   SDValue N0 = N->getOperand(0);
43623   SDValue N1 = N->getOperand(1);
43624   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
43625   EVT VT = N0.getValueType();
43626 
43627   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
43628   // since the result of setcc_c is all zero's or all ones.
43629   if (VT.isInteger() && !VT.isVector() &&
43630       N1C && N0.getOpcode() == ISD::AND &&
43631       N0.getOperand(1).getOpcode() == ISD::Constant) {
43632     SDValue N00 = N0.getOperand(0);
43633     APInt Mask = N0.getConstantOperandAPInt(1);
43634     Mask <<= N1C->getAPIntValue();
43635     bool MaskOK = false;
43636     // We can handle cases concerning bit-widening nodes containing setcc_c if
43637     // we carefully interrogate the mask to make sure we are semantics
43638     // preserving.
43639     // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
43640     // of the underlying setcc_c operation if the setcc_c was zero extended.
43641     // Consider the following example:
43642     //   zext(setcc_c)                 -> i32 0x0000FFFF
43643     //   c1                            -> i32 0x0000FFFF
43644     //   c2                            -> i32 0x00000001
43645     //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
43646     //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
43647     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
43648       MaskOK = true;
43649     } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
43650                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
43651       MaskOK = true;
43652     } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
43653                 N00.getOpcode() == ISD::ANY_EXTEND) &&
43654                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
43655       MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
43656     }
43657     if (MaskOK && Mask != 0) {
43658       SDLoc DL(N);
43659       return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
43660     }
43661   }
43662 
  // Hardware support for vector shifts is sparse, which makes us scalarize the
  // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
  // SHL.
43666   // (shl V, 1) -> add V,V
43667   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
43668     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
43669       assert(N0.getValueType().isVector() && "Invalid vector shift type");
43670       // We shift all of the values by one. In many cases we do not have
43671       // hardware support for this operation. This is better expressed as an ADD
43672       // of two values.
43673       if (N1SplatC->isOne())
43674         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
43675     }
43676 
43677   return SDValue();
43678 }
43679 
43680 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
43681                                            const X86Subtarget &Subtarget) {
43682   SDValue N0 = N->getOperand(0);
43683   SDValue N1 = N->getOperand(1);
43684   EVT VT = N0.getValueType();
43685   unsigned Size = VT.getSizeInBits();
43686 
43687   if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
43688     return V;
43689 
43690   // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
43691   // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
43692   // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
43693   // depending on sign of (SarConst - [56,48,32,24,16])
43694 
  // sexts on X86 are MOVs. The MOVs have the same code size as the
  // SHIFTs above (only a shift by 1 has a smaller encoding).
  // However, the MOVs have 2 advantages over a SHIFT:
  // 1. MOVs can write to a register that differs from the source.
  // 2. MOVs accept memory operands.
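  //
  // For example, with a 32-bit type, (sra (shl X, 24), 25) becomes
  // (sra (sext_inreg X, i8), 1), since SarConst(25) - (32 - 8) == 1.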
43700 
43701   if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
43702       N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
43703       N0.getOperand(1).getOpcode() != ISD::Constant)
43704     return SDValue();
43705 
43706   SDValue N00 = N0.getOperand(0);
43707   SDValue N01 = N0.getOperand(1);
43708   APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
43709   APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
43710   EVT CVT = N1.getValueType();
43711 
43712   if (SarConst.isNegative())
43713     return SDValue();
43714 
43715   for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
43716     unsigned ShiftSize = SVT.getSizeInBits();
    // Skip types without a corresponding sext/zext and ShlConst values that
    // are not one of [56,48,32,24,16].
43719     if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
43720       continue;
43721     SDLoc DL(N);
43722     SDValue NN =
43723         DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
43724     SarConst = SarConst - (Size - ShiftSize);
43725     if (SarConst == 0)
43726       return NN;
43727     else if (SarConst.isNegative())
43728       return DAG.getNode(ISD::SHL, DL, VT, NN,
43729                          DAG.getConstant(-SarConst, DL, CVT));
43730     else
43731       return DAG.getNode(ISD::SRA, DL, VT, NN,
43732                          DAG.getConstant(SarConst, DL, CVT));
43733   }
43734   return SDValue();
43735 }
43736 
43737 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
43738                                         TargetLowering::DAGCombinerInfo &DCI,
43739                                         const X86Subtarget &Subtarget) {
43740   SDValue N0 = N->getOperand(0);
43741   SDValue N1 = N->getOperand(1);
43742   EVT VT = N0.getValueType();
43743 
43744   if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
43745     return V;
43746 
43747   // Only do this on the last DAG combine as it can interfere with other
43748   // combines.
43749   if (!DCI.isAfterLegalizeDAG())
43750     return SDValue();
43751 
43752   // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
43753   // TODO: This is a generic DAG combine that became an x86-only combine to
43754   // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
43755   // and-not ('andn').
43756   if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
43757     return SDValue();
43758 
43759   auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
43760   auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
43761   if (!ShiftC || !AndC)
43762     return SDValue();
43763 
43764   // If we can shrink the constant mask below 8-bits or 32-bits, then this
43765   // transform should reduce code size. It may also enable secondary transforms
43766   // from improved known-bits analysis or instruction selection.
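  // For example, (srl (and X, 0x1F80), 7) --> (and (srl X, 7), 0x3F): the
  // shifted mask now fits in a sign-extended 8-bit immediate.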
43767   APInt MaskVal = AndC->getAPIntValue();
43768 
43769   // If this can be matched by a zero extend, don't optimize.
43770   if (MaskVal.isMask()) {
43771     unsigned TO = MaskVal.countTrailingOnes();
43772     if (TO >= 8 && isPowerOf2_32(TO))
43773       return SDValue();
43774   }
43775 
43776   APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
43777   unsigned OldMaskSize = MaskVal.getMinSignedBits();
43778   unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
43779   if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
43780       (OldMaskSize > 32 && NewMaskSize <= 32)) {
43781     // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
43782     SDLoc DL(N);
43783     SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
43784     SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
43785     return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
43786   }
43787   return SDValue();
43788 }
43789 
43790 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
43791                                          const X86Subtarget &Subtarget) {
43792   unsigned Opcode = N->getOpcode();
43793   assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
43794 
43795   SDLoc DL(N);
43796   EVT VT = N->getValueType(0);
43797   SDValue N0 = N->getOperand(0);
43798   SDValue N1 = N->getOperand(1);
43799   EVT SrcVT = N0.getValueType();
43800 
43801   SDValue BC0 =
43802       N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
43803   SDValue BC1 =
43804       N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
43805 
43806   // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
  // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))); this is mainly for
43808   // truncation trees that help us avoid lane crossing shuffles.
43809   // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
43810   // TODO: We don't handle vXf64 shuffles yet.
43811   if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32 &&
43812       BC0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43813       BC1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43814       BC0.getOperand(0) == BC1.getOperand(0) &&
43815       BC0.getOperand(0).getValueType().is256BitVector() &&
43816       BC0.getConstantOperandAPInt(1) == 0 &&
43817       BC1.getConstantOperandAPInt(1) ==
43818           BC0.getValueType().getVectorNumElements()) {
43819     SmallVector<SDValue> ShuffleOps;
43820     SmallVector<int> ShuffleMask, ScaledMask;
43821     SDValue Vec = peekThroughBitcasts(BC0.getOperand(0));
43822     if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
43823       resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
43824       // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
43825       // shuffle to a v4X64 width - we can probably relax this in the future.
43826       if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
43827           ShuffleOps[0].getValueType().is256BitVector() &&
43828           scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
43829         SDValue Lo, Hi;
43830         MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
43831         std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
43832         Lo = DAG.getBitcast(SrcVT, Lo);
43833         Hi = DAG.getBitcast(SrcVT, Hi);
43834         SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
43835         Res = DAG.getBitcast(ShufVT, Res);
43836         Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
43837         return DAG.getBitcast(VT, Res);
43838       }
43839     }
43840   }
43841 
43842   // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
43843   if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
43844     // If either/both ops are a shuffle that can scale to v2x64,
43845     // then see if we can perform this as a v4x32 post shuffle.
43846     SmallVector<SDValue> Ops0, Ops1;
43847     SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
43848     bool IsShuf0 =
43849         getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
43850         scaleShuffleElements(Mask0, 2, ScaledMask0) &&
43851         all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
43852     bool IsShuf1 =
43853         getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
43854         scaleShuffleElements(Mask1, 2, ScaledMask1) &&
43855         all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
43856     if (IsShuf0 || IsShuf1) {
43857       if (!IsShuf0) {
43858         Ops0.assign({BC0});
43859         ScaledMask0.assign({0, 1});
43860       }
43861       if (!IsShuf1) {
43862         Ops1.assign({BC1});
43863         ScaledMask1.assign({0, 1});
43864       }
43865 
43866       SDValue LHS, RHS;
43867       int PostShuffle[4] = {-1, -1, -1, -1};
43868       auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
43869         if (M < 0)
43870           return true;
43871         Idx = M % 2;
43872         SDValue Src = Ops[M / 2];
43873         if (!LHS || LHS == Src) {
43874           LHS = Src;
43875           return true;
43876         }
43877         if (!RHS || RHS == Src) {
43878           Idx += 2;
43879           RHS = Src;
43880           return true;
43881         }
43882         return false;
43883       };
43884       if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
43885           FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
43886           FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
43887           FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
43888         LHS = DAG.getBitcast(SrcVT, LHS);
43889         RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
43890         MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
43891         SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
43892         Res = DAG.getBitcast(ShufVT, Res);
43893         Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
43894         return DAG.getBitcast(VT, Res);
43895       }
43896     }
43897   }
43898 
43899   // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
43900   if (VT.is256BitVector() && Subtarget.hasInt256()) {
43901     SmallVector<int> Mask0, Mask1;
43902     SmallVector<SDValue> Ops0, Ops1;
43903     SmallVector<int, 2> ScaledMask0, ScaledMask1;
43904     if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
43905         getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
43906         !Ops0.empty() && !Ops1.empty() &&
43907         all_of(Ops0,
43908                [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
43909         all_of(Ops1,
43910                [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
43911         scaleShuffleElements(Mask0, 2, ScaledMask0) &&
43912         scaleShuffleElements(Mask1, 2, ScaledMask1)) {
43913       SDValue Op00 = peekThroughBitcasts(Ops0.front());
43914       SDValue Op10 = peekThroughBitcasts(Ops1.front());
43915       SDValue Op01 = peekThroughBitcasts(Ops0.back());
43916       SDValue Op11 = peekThroughBitcasts(Ops1.back());
43917       if ((Op00 == Op11) && (Op01 == Op10)) {
43918         std::swap(Op10, Op11);
43919         ShuffleVectorSDNode::commuteMask(ScaledMask1);
43920       }
43921       if ((Op00 == Op10) && (Op01 == Op11)) {
43922         const int Map[4] = {0, 2, 1, 3};
43923         SmallVector<int, 4> ShuffleMask(
43924             {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
43925              Map[ScaledMask1[1]]});
43926         MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
43927         SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
43928                                   DAG.getBitcast(SrcVT, Op01));
43929         Res = DAG.getBitcast(ShufVT, Res);
43930         Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
43931         return DAG.getBitcast(VT, Res);
43932       }
43933     }
43934   }
43935 
43936   return SDValue();
43937 }
43938 
43939 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
43940                                  TargetLowering::DAGCombinerInfo &DCI,
43941                                  const X86Subtarget &Subtarget) {
43942   unsigned Opcode = N->getOpcode();
43943   assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
43944          "Unexpected pack opcode");
43945 
43946   EVT VT = N->getValueType(0);
43947   SDValue N0 = N->getOperand(0);
43948   SDValue N1 = N->getOperand(1);
43949   unsigned NumDstElts = VT.getVectorNumElements();
43950   unsigned DstBitsPerElt = VT.getScalarSizeInBits();
43951   unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
43952   assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
43953          N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
43954          "Unexpected PACKSS/PACKUS input type");
43955 
43956   bool IsSigned = (X86ISD::PACKSS == Opcode);
43957 
43958   // Constant Folding.
43959   APInt UndefElts0, UndefElts1;
43960   SmallVector<APInt, 32> EltBits0, EltBits1;
43961   if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
43962       (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
43963       getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
43964       getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
43965     unsigned NumLanes = VT.getSizeInBits() / 128;
43966     unsigned NumSrcElts = NumDstElts / 2;
43967     unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
43968     unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
43969 
43970     APInt Undefs(NumDstElts, 0);
43971     SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
43972     for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
43973       for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
43974         unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
43975         auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
43976         auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
43977 
43978         if (UndefElts[SrcIdx]) {
43979           Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
43980           continue;
43981         }
43982 
43983         APInt &Val = EltBits[SrcIdx];
43984         if (IsSigned) {
43985           // PACKSS: Truncate signed value with signed saturation.
43986           // Source values less than dst minint are saturated to minint.
43987           // Source values greater than dst maxint are saturated to maxint.
43988           if (Val.isSignedIntN(DstBitsPerElt))
43989             Val = Val.trunc(DstBitsPerElt);
43990           else if (Val.isNegative())
43991             Val = APInt::getSignedMinValue(DstBitsPerElt);
43992           else
43993             Val = APInt::getSignedMaxValue(DstBitsPerElt);
43994         } else {
43995           // PACKUS: Truncate signed value with unsigned saturation.
43996           // Source values less than zero are saturated to zero.
43997           // Source values greater than dst maxuint are saturated to maxuint.
43998           if (Val.isIntN(DstBitsPerElt))
43999             Val = Val.trunc(DstBitsPerElt);
44000           else if (Val.isNegative())
44001             Val = APInt::getNullValue(DstBitsPerElt);
44002           else
44003             Val = APInt::getAllOnesValue(DstBitsPerElt);
44004         }
44005         Bits[Lane * NumDstEltsPerLane + Elt] = Val;
44006       }
44007     }
44008 
44009     return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
44010   }
44011 
44012   // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
44013   if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
44014     return V;
44015 
44016   // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
44017   // truncate to create a larger truncate.
44018   if (Subtarget.hasAVX512() &&
44019       N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
44020       N0.getOperand(0).getValueType() == MVT::v8i32) {
44021     if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
44022         (!IsSigned &&
44023          DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
44024       if (Subtarget.hasVLX())
44025         return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
44026 
44027       // Widen input to v16i32 so we can truncate that.
44028       SDLoc dl(N);
44029       SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
44030                                    N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
44031       return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
44032     }
44033   }
44034 
44035   // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
44036   if (VT.is128BitVector()) {
44037     unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
44038     SDValue Src0, Src1;
44039     if (N0.getOpcode() == ExtOpc &&
44040         N0.getOperand(0).getValueType().is64BitVector() &&
44041         N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
44042       Src0 = N0.getOperand(0);
44043     }
44044     if (N1.getOpcode() == ExtOpc &&
44045         N1.getOperand(0).getValueType().is64BitVector() &&
44046         N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
44047       Src1 = N1.getOperand(0);
44048     }
44049     if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
44050       assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
44051       Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
44052       Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
44053       return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
44054     }
44055   }
44056 
44057   // Attempt to combine as shuffle.
44058   SDValue Op(N, 0);
44059   if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44060     return Res;
44061 
44062   return SDValue();
44063 }
44064 
44065 static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
44066                                     TargetLowering::DAGCombinerInfo &DCI,
44067                                     const X86Subtarget &Subtarget) {
44068   assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
44069           X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
44070          "Unexpected horizontal add/sub opcode");
44071 
44072   if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
    // For slow-hop targets, if we have a hop with a single op, see if we
    // already have another user that we can reuse and shuffle the result.
44075     MVT VT = N->getSimpleValueType(0);
44076     SDValue LHS = N->getOperand(0);
44077     SDValue RHS = N->getOperand(1);
44078     if (VT.is128BitVector() && LHS == RHS) {
44079       for (SDNode *User : LHS->uses()) {
44080         if (User != N && User->getOpcode() == N->getOpcode()) {
44081           MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
44082           if (User->getOperand(0) == LHS && !User->getOperand(1).isUndef()) {
44083             return DAG.getBitcast(
44084                 VT,
44085                 DAG.getVectorShuffle(ShufVT, SDLoc(N),
44086                                      DAG.getBitcast(ShufVT, SDValue(User, 0)),
44087                                      DAG.getUNDEF(ShufVT), {0, 1, 0, 1}));
44088           }
44089           if (User->getOperand(1) == LHS && !User->getOperand(0).isUndef()) {
44090             return DAG.getBitcast(
44091                 VT,
44092                 DAG.getVectorShuffle(ShufVT, SDLoc(N),
44093                                      DAG.getBitcast(ShufVT, SDValue(User, 0)),
44094                                      DAG.getUNDEF(ShufVT), {2, 3, 2, 3}));
44095           }
44096         }
44097       }
44098     }
44099 
    // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
44101     if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
44102         LHS.getOpcode() == RHS.getOpcode() &&
44103         LHS.getValueType() == RHS.getValueType()) {
44104       SDValue LHS0 = LHS.getOperand(0);
44105       SDValue RHS0 = LHS.getOperand(1);
44106       SDValue LHS1 = RHS.getOperand(0);
44107       SDValue RHS1 = RHS.getOperand(1);
44108       if ((LHS0 == RHS0 || LHS0.isUndef() || RHS0.isUndef()) &&
44109           (LHS1 == RHS1 || LHS1.isUndef() || RHS1.isUndef())) {
44110         SDLoc DL(N);
44111         SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
44112                                   LHS0.isUndef() ? RHS0 : LHS0,
44113                                   LHS1.isUndef() ? RHS1 : LHS1);
44114         MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
44115         Res = DAG.getBitcast(ShufVT, Res);
44116         SDValue NewLHS =
44117             DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
44118                         getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
44119         SDValue NewRHS =
44120             DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
44121                         getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
44122         DAG.ReplaceAllUsesOfValueWith(LHS, DAG.getBitcast(VT, NewLHS));
44123         DAG.ReplaceAllUsesOfValueWith(RHS, DAG.getBitcast(VT, NewRHS));
44124         return SDValue(N, 0);
44125       }
44126     }
44127   }
44128 
44129   // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
44130   if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
44131     return V;
44132 
44133   return SDValue();
44134 }
44135 
44136 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
44137                                      TargetLowering::DAGCombinerInfo &DCI,
44138                                      const X86Subtarget &Subtarget) {
44139   assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
44140           X86ISD::VSRL == N->getOpcode()) &&
44141          "Unexpected shift opcode");
44142   EVT VT = N->getValueType(0);
44143   SDValue N0 = N->getOperand(0);
44144   SDValue N1 = N->getOperand(1);
44145 
44146   // Shift zero -> zero.
44147   if (ISD::isBuildVectorAllZeros(N0.getNode()))
44148     return DAG.getConstant(0, SDLoc(N), VT);
44149 
44150   // Detect constant shift amounts.
44151   APInt UndefElts;
44152   SmallVector<APInt, 32> EltBits;
44153   if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
44154     unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
44155     return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
44156                                       EltBits[0].getZExtValue(), DAG);
44157   }
44158 
44159   APInt KnownUndef, KnownZero;
44160   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44161   APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
44162   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
44163                                      KnownZero, DCI))
44164     return SDValue(N, 0);
44165 
44166   return SDValue();
44167 }
44168 
44169 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
44170                                      TargetLowering::DAGCombinerInfo &DCI,
44171                                      const X86Subtarget &Subtarget) {
44172   unsigned Opcode = N->getOpcode();
44173   assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
44174           X86ISD::VSRLI == Opcode) &&
44175          "Unexpected shift opcode");
44176   bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
44177   EVT VT = N->getValueType(0);
44178   SDValue N0 = N->getOperand(0);
44179   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
44180   assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
44181          "Unexpected value type");
44182   assert(N->getOperand(1).getValueType() == MVT::i8 &&
44183          "Unexpected shift amount type");
44184 
44185   // (shift undef, X) -> 0
44186   if (N0.isUndef())
44187     return DAG.getConstant(0, SDLoc(N), VT);
44188 
44189   // Out of range logical bit shifts are guaranteed to be zero.
44190   // Out of range arithmetic bit shifts splat the sign bit.
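  // E.g. (an illustrative case of the rules above) a v4i32 VSRLI by 32 or more
  // folds directly to zero, while a v4i32 VSRAI by 32 or more splats the sign
  // bit and is treated below as a shift by 31.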
44191   unsigned ShiftVal = N->getConstantOperandVal(1);
44192   if (ShiftVal >= NumBitsPerElt) {
44193     if (LogicalShift)
44194       return DAG.getConstant(0, SDLoc(N), VT);
44195     ShiftVal = NumBitsPerElt - 1;
44196   }
44197 
44198   // (shift X, 0) -> X
44199   if (!ShiftVal)
44200     return N0;
44201 
44202   // (shift 0, C) -> 0
44203   if (ISD::isBuildVectorAllZeros(N0.getNode()))
44204     // N0 is all zeros or undef. We guarantee that the bits shifted into the
44205     // result are all zeros, not undef.
44206     return DAG.getConstant(0, SDLoc(N), VT);
44207 
44208   // (VSRAI -1, C) -> -1
44209   if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
44210     // N0 is all ones or undef. We guarantee that the bits shifted into the
44211     // result are all ones, not undef.
44212     return DAG.getConstant(-1, SDLoc(N), VT);
44213 
44214   // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
44215   if (Opcode == N0.getOpcode()) {
44216     unsigned ShiftVal2 = N0.getConstantOperandVal(1);
44217     unsigned NewShiftVal = ShiftVal + ShiftVal2;
44218     if (NewShiftVal >= NumBitsPerElt) {
44219       // Out of range logical bit shifts are guaranteed to be zero.
44220       // Out of range arithmetic bit shifts splat the sign bit.
44221       if (LogicalShift)
44222         return DAG.getConstant(0, SDLoc(N), VT);
44223       NewShiftVal = NumBitsPerElt - 1;
44224     }
44225     return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
44226                        DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
44227   }
44228 
44229   // We can decode 'whole byte' logical bit shifts as shuffles.
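  // E.g. a v2i64 VSRLI by 16 discards the low 2 bytes of each 64-bit element
  // and fills the top 2 bytes with zero, which is expressible as a byte
  // shuffle with zero elements (a sketch of the idea; the shuffle combiner
  // below picks the actual lowering).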
44230   if (LogicalShift && (ShiftVal % 8) == 0) {
44231     SDValue Op(N, 0);
44232     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44233       return Res;
44234   }
44235 
44236   // Constant Folding.
44237   APInt UndefElts;
44238   SmallVector<APInt, 32> EltBits;
44239   if (N->isOnlyUserOf(N0.getNode()) &&
44240       getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
44241     assert(EltBits.size() == VT.getVectorNumElements() &&
44242            "Unexpected shift value type");
44243     // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
44244     // created an undef input due to no input bits being demanded, but the
44245     // user still expects 0 in the other bits.
44246     for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
44247       APInt &Elt = EltBits[i];
44248       if (UndefElts[i])
44249         Elt = 0;
44250       else if (X86ISD::VSHLI == Opcode)
44251         Elt <<= ShiftVal;
44252       else if (X86ISD::VSRAI == Opcode)
44253         Elt.ashrInPlace(ShiftVal);
44254       else
44255         Elt.lshrInPlace(ShiftVal);
44256     }
44257     // Reset undef elements since they were zeroed above.
44258     UndefElts = 0;
44259     return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
44260   }
44261 
44262   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44263   if (TLI.SimplifyDemandedBits(SDValue(N, 0),
44264                                APInt::getAllOnesValue(NumBitsPerElt), DCI))
44265     return SDValue(N, 0);
44266 
44267   return SDValue();
44268 }
44269 
44270 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
44271                                    TargetLowering::DAGCombinerInfo &DCI,
44272                                    const X86Subtarget &Subtarget) {
44273   EVT VT = N->getValueType(0);
44274   assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
44275           (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
44276           N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
44277          "Unexpected vector insertion");
44278 
44279   if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
44280     unsigned NumBitsPerElt = VT.getScalarSizeInBits();
44281     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44282     if (TLI.SimplifyDemandedBits(SDValue(N, 0),
44283                                  APInt::getAllOnesValue(NumBitsPerElt), DCI))
44284       return SDValue(N, 0);
44285   }
44286 
44287   // Attempt to combine insertion patterns to a shuffle.
44288   if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
44289     SDValue Op(N, 0);
44290     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44291       return Res;
44292   }
44293 
44294   return SDValue();
44295 }
44296 
44297 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
44298 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
44299 /// OR -> CMPNEQSS.
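/// A rough sketch of the scalar pattern this matches (illustrative only):
///   and (setcc E,  (fcmp x, y)),
///       (setcc NP, (fcmp x, y))
/// i.e. the "equal and ordered" test produced for a scalar (x == y), which can
/// instead be done with a single CMPEQSS and a mask of the low bit.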
44300 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
44301                                    TargetLowering::DAGCombinerInfo &DCI,
44302                                    const X86Subtarget &Subtarget) {
44303   unsigned opcode;
44304 
44305   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
44306   // we're requiring SSE2 for both.
44307   if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
44308     SDValue N0 = N->getOperand(0);
44309     SDValue N1 = N->getOperand(1);
44310     SDValue CMP0 = N0.getOperand(1);
44311     SDValue CMP1 = N1.getOperand(1);
44312     SDLoc DL(N);
44313 
44314     // The SETCCs should both refer to the same CMP.
44315     if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
44316       return SDValue();
44317 
44318     SDValue CMP00 = CMP0->getOperand(0);
44319     SDValue CMP01 = CMP0->getOperand(1);
44320     EVT     VT    = CMP00.getValueType();
44321 
44322     if (VT == MVT::f32 || VT == MVT::f64) {
44323       bool ExpectingFlags = false;
44324       // Check for any users that want flags:
44325       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
44326            !ExpectingFlags && UI != UE; ++UI)
44327         switch (UI->getOpcode()) {
44328         default:
44329         case ISD::BR_CC:
44330         case ISD::BRCOND:
44331         case ISD::SELECT:
44332           ExpectingFlags = true;
44333           break;
44334         case ISD::CopyToReg:
44335         case ISD::SIGN_EXTEND:
44336         case ISD::ZERO_EXTEND:
44337         case ISD::ANY_EXTEND:
44338           break;
44339         }
44340 
44341       if (!ExpectingFlags) {
44342         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
44343         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
44344 
44345         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
44346           X86::CondCode tmp = cc0;
44347           cc0 = cc1;
44348           cc1 = tmp;
44349         }
44350 
44351         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
44352             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
44353           // FIXME: need symbolic constants for these magic numbers.
44354           // See X86ATTInstPrinter.cpp:printSSECC().
44355           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
44356           if (Subtarget.hasAVX512()) {
44357             SDValue FSetCC =
44358                 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
44359                             DAG.getTargetConstant(x86cc, DL, MVT::i8));
44360             // Need to fill with zeros to ensure the bitcast will produce zeroes
44361             // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
44362             SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
44363                                       DAG.getConstant(0, DL, MVT::v16i1),
44364                                       FSetCC, DAG.getIntPtrConstant(0, DL));
44365             return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
44366                                       N->getSimpleValueType(0));
44367           }
44368           SDValue OnesOrZeroesF =
44369               DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
44370                           CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
44371 
44372           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
44373           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
44374 
44375           if (is64BitFP && !Subtarget.is64Bit()) {
44376             // On a 32-bit target, we cannot bitcast the 64-bit float to a
44377             // 64-bit integer, since that's not a legal type. Since
44378             // OnesOrZeroesF is all ones or all zeroes, we don't need all the
44379             // bits, but can do this little dance to extract the lowest 32 bits
44380             // and work with those going forward.
44381             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
44382                                            OnesOrZeroesF);
44383             SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
44384             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
44385                                         Vector32, DAG.getIntPtrConstant(0, DL));
44386             IntVT = MVT::i32;
44387           }
44388 
44389           SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
44390           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
44391                                       DAG.getConstant(1, DL, IntVT));
44392           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
44393                                               ANDed);
44394           return OneBitOfTruth;
44395         }
44396       }
44397     }
44398   }
44399   return SDValue();
44400 }
44401 
44402 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
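/// For example (a minimal sketch): with v4i32 operands,
///   and (xor X, <-1,-1,-1,-1>), Y
/// becomes a single ANDNP/PANDN node, so the all-ones constant and the
/// explicit NOT never need to be materialized.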
44403 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
44404   assert(N->getOpcode() == ISD::AND);
44405 
44406   MVT VT = N->getSimpleValueType(0);
44407   if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
44408     return SDValue();
44409 
44410   SDValue X, Y;
44411   SDValue N0 = N->getOperand(0);
44412   SDValue N1 = N->getOperand(1);
44413 
44414   auto GetNot = [&VT, &DAG](SDValue V) {
44415     // Basic X = NOT(Y) detection.
44416     if (SDValue Not = IsNOT(V, DAG))
44417       return Not;
44418     // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
44419     if (V.getOpcode() == X86ISD::VBROADCAST) {
44420       SDValue Src = V.getOperand(0);
44421       EVT SrcVT = Src.getValueType();
44422       if (!SrcVT.isVector())
44423         return SDValue();
44424       if (SDValue Not = IsNOT(Src, DAG))
44425         return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
44426                            DAG.getBitcast(SrcVT, Not));
44427     }
44428     return SDValue();
44429   };
44430 
44431   if (SDValue Not = GetNot(N0)) {
44432     X = Not;
44433     Y = N1;
44434   } else if (SDValue Not = GetNot(N1)) {
44435     X = Not;
44436     Y = N0;
44437   } else
44438     return SDValue();
44439 
44440   X = DAG.getBitcast(VT, X);
44441   Y = DAG.getBitcast(VT, Y);
44442   return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
44443 }
44444 
44445 // Try to widen AND, OR and XOR nodes to VT in order to remove casts around
44446 // logical operations, as in the example below:
44447 //   or (and (truncate x), (truncate y)),
44448 //      (xor (truncate z), build_vector (constants))
44449 // Given a target type \p VT, we generate
44450 //   or (and x, y), (xor z, zext(build_vector (constants)))
44451 // assuming x, y and z are of type \p VT. We can do so if each operand is
44452 // either a truncate from VT, a vector of constants, or can itself be
44453 // recursively promoted.
44454 static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
44455                                      unsigned Depth) {
44456   // Limit recursion to avoid excessive compile times.
44457   if (Depth >= SelectionDAG::MaxRecursionDepth)
44458     return SDValue();
44459 
44460   if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
44461       N->getOpcode() != ISD::OR)
44462     return SDValue();
44463 
44464   SDValue N0 = N->getOperand(0);
44465   SDValue N1 = N->getOperand(1);
44466   SDLoc DL(N);
44467 
44468   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44469   if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
44470     return SDValue();
44471 
44472   if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
44473     N0 = NN0;
44474   else {
44475     // The Left side has to be a trunc.
44476     if (N0.getOpcode() != ISD::TRUNCATE)
44477       return SDValue();
44478 
44479     // The type of the truncated inputs.
44480     if (N0.getOperand(0).getValueType() != VT)
44481       return SDValue();
44482 
44483     N0 = N0.getOperand(0);
44484   }
44485 
44486   if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
44487     N1 = NN1;
44488   else {
44489     // The right side has to be a 'trunc' or a constant vector.
44490     bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
44491                     N1.getOperand(0).getValueType() == VT;
44492     if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
44493       return SDValue();
44494 
44495     if (RHSTrunc)
44496       N1 = N1.getOperand(0);
44497     else
44498       N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
44499   }
44500 
44501   return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
44502 }
44503 
44504 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
44505 // register. In most cases we actually compare or select YMM-sized registers
44506 // and mixing the two types creates horrible code. This method optimizes
44507 // some of the transition sequences.
44508 // Even with AVX-512 this is still useful for removing casts around logical
44509 // operations on vXi1 mask types.
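// A rough illustration of one case this helps: logic between two compare
// results is performed on the narrow promoted mask type and then sign-extended
// back for a blend; widening the logic to the blend's type removes the
// truncate/extend pair around it.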
44510 static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
44511                                      const X86Subtarget &Subtarget) {
44512   EVT VT = N->getValueType(0);
44513   assert(VT.isVector() && "Expected vector type");
44514 
44515   SDLoc DL(N);
44516   assert((N->getOpcode() == ISD::ANY_EXTEND ||
44517           N->getOpcode() == ISD::ZERO_EXTEND ||
44518           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
44519 
44520   SDValue Narrow = N->getOperand(0);
44521   EVT NarrowVT = Narrow.getValueType();
44522 
44523   // Generate the wide operation.
44524   SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
44525   if (!Op)
44526     return SDValue();
44527   switch (N->getOpcode()) {
44528   default: llvm_unreachable("Unexpected opcode");
44529   case ISD::ANY_EXTEND:
44530     return Op;
44531   case ISD::ZERO_EXTEND:
44532     return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
44533   case ISD::SIGN_EXTEND:
44534     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
44535                        Op, DAG.getValueType(NarrowVT));
44536   }
44537 }
44538 
44539 static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
44540   unsigned FPOpcode;
44541   switch (Opcode) {
44542   default: llvm_unreachable("Unexpected input node for FP logic conversion");
44543   case ISD::AND: FPOpcode = X86ISD::FAND; break;
44544   case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
44545   case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
44546   }
44547   return FPOpcode;
44548 }
44549 
44550 /// If both input operands of a logic op are being cast from floating point
44551 /// types, try to convert this into a floating point logic node to avoid
44552 /// unnecessary moves from SSE to integer registers.
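/// As a rough example (assuming SSE2): for
///   and (bitcast f64 X to i64), (bitcast f64 Y to i64)
/// we emit (bitcast (FAND X, Y) to i64), keeping the values in XMM registers
/// instead of bouncing them through GPRs.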
44553 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
44554                                         const X86Subtarget &Subtarget) {
44555   EVT VT = N->getValueType(0);
44556   SDValue N0 = N->getOperand(0);
44557   SDValue N1 = N->getOperand(1);
44558   SDLoc DL(N);
44559 
44560   if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
44561     return SDValue();
44562 
44563   SDValue N00 = N0.getOperand(0);
44564   SDValue N10 = N1.getOperand(0);
44565   EVT N00Type = N00.getValueType();
44566   EVT N10Type = N10.getValueType();
44567 
44568   // Ensure that both types are the same and are legal scalar fp types.
44569   if (N00Type != N10Type ||
44570       !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
44571         (Subtarget.hasSSE2() && N00Type == MVT::f64)))
44572     return SDValue();
44573 
44574   unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
44575   SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
44576   return DAG.getBitcast(VT, FPLogic);
44577 }
44578 
44579 // Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
44580 // to reduce XMM->GPR traffic.
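// E.g. (xor (movmsk V0), (movmsk V1)) becomes (movmsk (xor V0, V1)), so the
// logic happens in the vector domain and only one MOVMSK transfer remains
// (an illustrative instance of the general fold).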
44581 static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
44582   unsigned Opc = N->getOpcode();
44583   assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
44584          "Unexpected bit opcode");
44585 
44586   SDValue N0 = N->getOperand(0);
44587   SDValue N1 = N->getOperand(1);
44588 
44589   // Both operands must be single use MOVMSK.
44590   if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
44591       N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
44592     return SDValue();
44593 
44594   SDValue Vec0 = N0.getOperand(0);
44595   SDValue Vec1 = N1.getOperand(0);
44596   EVT VecVT0 = Vec0.getValueType();
44597   EVT VecVT1 = Vec1.getValueType();
44598 
44599   // Both MOVMSK operands must be from vectors of the same size and same element
44600   // size, but it's OK for an fp/int difference.
44601   if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
44602       VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
44603     return SDValue();
44604 
44605   SDLoc DL(N);
44606   unsigned VecOpc =
44607       VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
44608   SDValue Result =
44609       DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
44610   return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
44611 }
44612 
44613 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
44614 /// mask (mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
44615 /// with a shift-right to eliminate loading the vector constant mask value.
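/// A minimal sketch of the idea: if the AND input is known to be all-ones or
/// all-zeros per element (e.g. a PCMPGT result), then for 32-bit elements
///   and (pcmpgt X, Y), <1,1,1,1>
/// can become (srl (pcmpgt X, Y), 31), avoiding a constant-pool load for the
/// mask.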
44616 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
44617                                      const X86Subtarget &Subtarget) {
44618   SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
44619   SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
44620   EVT VT0 = Op0.getValueType();
44621   EVT VT1 = Op1.getValueType();
44622 
44623   if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
44624     return SDValue();
44625 
44626   APInt SplatVal;
44627   if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
44628       !SplatVal.isMask())
44629     return SDValue();
44630 
44631   // Don't prevent creation of ANDN.
44632   if (isBitwiseNot(Op0))
44633     return SDValue();
44634 
44635   if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
44636     return SDValue();
44637 
44638   unsigned EltBitWidth = VT0.getScalarSizeInBits();
44639   if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
44640     return SDValue();
44641 
44642   SDLoc DL(N);
44643   unsigned ShiftVal = SplatVal.countTrailingOnes();
44644   SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
44645   SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
44646   return DAG.getBitcast(N->getValueType(0), Shift);
44647 }
44648 
44649 // Get the index node from the lowered DAG of a GEP IR instruction with one
44650 // indexing dimension.
44651 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
44652   if (Ld->isIndexed())
44653     return SDValue();
44654 
44655   SDValue Base = Ld->getBasePtr();
44656 
44657   if (Base.getOpcode() != ISD::ADD)
44658     return SDValue();
44659 
44660   SDValue ShiftedIndex = Base.getOperand(0);
44661 
44662   if (ShiftedIndex.getOpcode() != ISD::SHL)
44663     return SDValue();
44664 
44665   return ShiftedIndex.getOperand(0);
44667 }
44668 
44669 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
44670   if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
44671     switch (VT.getSizeInBits()) {
44672     default: return false;
44673     case 64: return Subtarget.is64Bit();
44674     case 32: return true;
44675     }
44676   }
44677   return false;
44678 }
44679 
44680 // This function recognizes cases where the X86 bzhi instruction can replace
44681 // an 'and-load' sequence.
44682 // Given a load of an integer value from an array of constants defined as
44683 // follows:
44684 //
44685 //   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
44686 //
44687 // followed by a bitwise 'and' of the loaded value with another input, this is
44688 // equivalent to performing bzhi (zero high bits) on the other input, using
44689 // the same index as the load.
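// As a C-level sketch (with a hypothetical 'arr' defined as above), the source
//   result = arr[idx] & x;
// becomes roughly
//   result = x & (0xFFFFFFFF >> (32 - idx));
// which the BMI2 patterns then select as a single BZHI.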
44690 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
44691                                     const X86Subtarget &Subtarget) {
44692   MVT VT = Node->getSimpleValueType(0);
44693   SDLoc dl(Node);
44694 
44695   // Check if subtarget has BZHI instruction for the node's type
44696   if (!hasBZHI(Subtarget, VT))
44697     return SDValue();
44698 
44699   // Try matching the pattern for both operands.
44700   for (unsigned i = 0; i < 2; i++) {
44701     SDValue N = Node->getOperand(i);
44702     LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
44703 
44704     // Bail out if the operand is not a load instruction.
44705     if (!Ld)
44706       return SDValue();
44707 
44708     const Value *MemOp = Ld->getMemOperand()->getValue();
44709 
44710     if (!MemOp)
44711       return SDValue();
44712 
44713     if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
44714       if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
44715         if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
44716 
44717           Constant *Init = GV->getInitializer();
44718           Type *Ty = Init->getType();
44719           if (!isa<ConstantDataArray>(Init) ||
44720               !Ty->getArrayElementType()->isIntegerTy() ||
44721               Ty->getArrayElementType()->getScalarSizeInBits() !=
44722                   VT.getSizeInBits() ||
44723               Ty->getArrayNumElements() >
44724                   Ty->getArrayElementType()->getScalarSizeInBits())
44725             continue;
44726 
44727           // Check if the array's constant elements are suitable to our case.
44728           uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
44729           bool ConstantsMatch = true;
44730           for (uint64_t j = 0; j < ArrayElementCount; j++) {
44731             auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
44732             if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
44733               ConstantsMatch = false;
44734               break;
44735             }
44736           }
44737           if (!ConstantsMatch)
44738             continue;
44739 
44740           // Do the transformation (for a 32-bit type):
44741           //   (and (load arr[idx]), inp)
44742           //     -> (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
44743           // which will later be selected as a single bzhi instruction.
44744           SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
44745           SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
44746 
44747           // Get the Node which indexes into the array.
44748           SDValue Index = getIndexFromUnindexedLoad(Ld);
44749           if (!Index)
44750             return SDValue();
44751           Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
44752 
44753           SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
44754           Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
44755 
44756           SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
44757           SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
44758 
44759           return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
44760         }
44761       }
44762     }
44763   }
44764   return SDValue();
44765 }
44766 
44767 // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef, ...))), C)
44768 // where C is a mask containing the same number of bits as the setcc and
44769 // where the setcc will freely zero the upper bits of the k-register. We can
44770 // replace the undef in the concat with 0s and remove the AND. This mainly
44771 // helps with v2i1/v4i1 setcc being cast to a scalar.
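// Rough example (illustrative types): for
//   and (bitcast (v16i1 concat (v4i1 setcc), undef, undef, undef) to i16), 15
// the AND already discards the upper lanes, so the undef subvectors can be
// rebuilt as zeros and the AND dropped.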
44772 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
44773                                              const X86Subtarget &Subtarget) {
44774   assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
44775 
44776   EVT VT = N->getValueType(0);
44777 
44778   // Make sure this is an AND with constant. We will check the value of the
44779   // constant later.
44780   if (!isa<ConstantSDNode>(N->getOperand(1)))
44781     return SDValue();
44782 
44783   // This is implied by the ConstantSDNode.
44784   assert(!VT.isVector() && "Expected scalar VT!");
44785 
44786   if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
44787       !N->getOperand(0).hasOneUse() ||
44788       !N->getOperand(0).getOperand(0).hasOneUse())
44789     return SDValue();
44790 
44791   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44792   SDValue Src = N->getOperand(0).getOperand(0);
44793   EVT SrcVT = Src.getValueType();
44794   if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
44795       !TLI.isTypeLegal(SrcVT))
44796     return SDValue();
44797 
44798   if (Src.getOpcode() != ISD::CONCAT_VECTORS)
44799     return SDValue();
44800 
44801   // We only care about the first subvector of the concat, we expect the
44802   // other subvectors to be ignored due to the AND if we make the change.
44803   SDValue SubVec = Src.getOperand(0);
44804   EVT SubVecVT = SubVec.getValueType();
44805 
44806   // First subvector should be a setcc with a legal result type. The RHS of the
44807   // AND should be a mask with this many bits.
44808   if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
44809       !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
44810     return SDValue();
44811 
44812   EVT SetccVT = SubVec.getOperand(0).getValueType();
44813   if (!TLI.isTypeLegal(SetccVT) ||
44814       !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
44815     return SDValue();
44816 
44817   if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
44818     return SDValue();
44819 
44820   // We passed all the checks. Rebuild the concat_vectors with zeroes
44821   // and cast it back to VT.
44822   SDLoc dl(N);
44823   SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
44824                               DAG.getConstant(0, dl, SubVecVT));
44825   Ops[0] = SubVec;
44826   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
44827                                Ops);
44828   return DAG.getBitcast(VT, Concat);
44829 }
44830 
44831 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
44832                           TargetLowering::DAGCombinerInfo &DCI,
44833                           const X86Subtarget &Subtarget) {
44834   EVT VT = N->getValueType(0);
44835 
44836   // If this is SSE1 only convert to FAND to avoid scalarization.
44837   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
44838     return DAG.getBitcast(
44839         MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
44840                                 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
44841                                 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
44842   }
44843 
44844   // Use a 32-bit and+zext if upper bits known zero.
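  // E.g. (an illustrative case) (and i64 X, (zext i32 Y to i64)) can be
  // rewritten as
  //   (zext (and i32 (trunc X), Y) to i64)
  // since the 32-bit AND implicitly zero-extends on x86-64, making the zext
  // free and the encoding smaller.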
44845   if (VT == MVT::i64 && Subtarget.is64Bit() &&
44846       !isa<ConstantSDNode>(N->getOperand(1))) {
44847     APInt HiMask = APInt::getHighBitsSet(64, 32);
44848     if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
44849         DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
44850       SDLoc dl(N);
44851       SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
44852       SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
44853       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
44854                          DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
44855     }
44856   }
44857 
44858   // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
44859   // TODO: Support multiple SrcOps.
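  // E.g. an all-of reduction over the four lanes of a v4i1 compare becomes:
  // collect the mask bits into a scalar (bitcast or MOVMSK), AND with 0xF and
  // compare equal to 0xF (a sketch of what the code below emits).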
44860   if (VT == MVT::i1) {
44861     SmallVector<SDValue, 2> SrcOps;
44862     SmallVector<APInt, 2> SrcPartials;
44863     if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
44864         SrcOps.size() == 1) {
44865       SDLoc dl(N);
44866       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44867       unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
44868       EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
44869       SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
44870       if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
44871         Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
44872       if (Mask) {
44873         assert(SrcPartials[0].getBitWidth() == NumElts &&
44874                "Unexpected partial reduction mask");
44875         SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
44876         Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
44877         return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
44878       }
44879     }
44880   }
44881 
44882   if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
44883     return V;
44884 
44885   if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
44886     return R;
44887 
44888   if (DCI.isBeforeLegalizeOps())
44889     return SDValue();
44890 
44891   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
44892     return R;
44893 
44894   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
44895     return FPLogic;
44896 
44897   if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
44898     return R;
44899 
44900   if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
44901     return ShiftRight;
44902 
44903   if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
44904     return R;
44905 
44906   // Attempt to recursively combine a bitmask AND with shuffles.
44907   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
44908     SDValue Op(N, 0);
44909     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44910       return Res;
44911   }
44912 
44913   // Attempt to combine a scalar bitmask AND with an extracted shuffle.
44914   if ((VT.getScalarSizeInBits() % 8) == 0 &&
44915       N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44916       isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
44917     SDValue BitMask = N->getOperand(1);
44918     SDValue SrcVec = N->getOperand(0).getOperand(0);
44919     EVT SrcVecVT = SrcVec.getValueType();
44920 
44921     // Check that the constant bitmask masks whole bytes.
44922     APInt UndefElts;
44923     SmallVector<APInt, 64> EltBits;
44924     if (VT == SrcVecVT.getScalarType() &&
44925         N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
44926         getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
44927         llvm::all_of(EltBits, [](const APInt &M) {
44928           return M.isNullValue() || M.isAllOnesValue();
44929         })) {
44930       unsigned NumElts = SrcVecVT.getVectorNumElements();
44931       unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
44932       unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
44933 
44934       // Create a root shuffle mask from the byte mask and the extracted index.
44935       SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
44936       for (unsigned i = 0; i != Scale; ++i) {
44937         if (UndefElts[i])
44938           continue;
44939         int VecIdx = Scale * Idx + i;
44940         ShuffleMask[VecIdx] =
44941             EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
44942       }
44943 
44944       if (SDValue Shuffle = combineX86ShufflesRecursively(
44945               {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
44946               X86::MaxShuffleCombineDepth,
44947               /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
44948               /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
44949         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
44950                            N->getOperand(0).getOperand(1));
44951     }
44952   }
44953 
44954   return SDValue();
44955 }
44956 
44957 // Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
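// E.g. with AVX512/VLX this becomes a single VPTERNLOG with immediate 0xCA
// (the "bitwise select" truth table); otherwise the ANDNP form means only C,
// not both C and ~C, has to be materialized. (Illustrative summary of the
// code below.)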
44958 static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
44959                                      const X86Subtarget &Subtarget) {
44960   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
44961 
44962   MVT VT = N->getSimpleValueType(0);
44963   if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
44964     return SDValue();
44965 
44966   SDValue N0 = peekThroughBitcasts(N->getOperand(0));
44967   SDValue N1 = peekThroughBitcasts(N->getOperand(1));
44968   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
44969     return SDValue();
44970 
44971   // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
44972   // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
44973   bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
44974                       Subtarget.hasVLX();
44975   if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
44976         !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
44977     return SDValue();
44978 
44979   // Attempt to extract constant byte masks.
44980   APInt UndefElts0, UndefElts1;
44981   SmallVector<APInt, 32> EltBits0, EltBits1;
44982   if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
44983                                      false, false))
44984     return SDValue();
44985   if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
44986                                      false, false))
44987     return SDValue();
44988 
44989   for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
44990     // TODO - add UNDEF elts support.
44991     if (UndefElts0[i] || UndefElts1[i])
44992       return SDValue();
44993     if (EltBits0[i] != ~EltBits1[i])
44994       return SDValue();
44995   }
44996 
44997   SDLoc DL(N);
44998 
44999   if (UseVPTERNLOG) {
45000     // Emit a VPTERNLOG node directly.
45001     SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
45002     SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
45003     SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
45004     SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
45005     return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
45006   }
45007 
45008   SDValue X = N->getOperand(0);
45009   SDValue Y =
45010       DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
45011                   DAG.getBitcast(VT, N1.getOperand(0)));
45012   return DAG.getNode(ISD::OR, DL, VT, X, Y);
45013 }
45014 
45015 // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
45016 static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
45017   if (N->getOpcode() != ISD::OR)
45018     return false;
45019 
45020   SDValue N0 = N->getOperand(0);
45021   SDValue N1 = N->getOperand(1);
45022 
45023   // Canonicalize AND to LHS.
45024   if (N1.getOpcode() == ISD::AND)
45025     std::swap(N0, N1);
45026 
45027   // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
45028   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
45029     return false;
45030 
45031   Mask = N1.getOperand(0);
45032   X = N1.getOperand(1);
45033 
45034   // Check to see if the mask appeared in both the AND and ANDNP.
45035   if (N0.getOperand(0) == Mask)
45036     Y = N0.getOperand(1);
45037   else if (N0.getOperand(1) == Mask)
45038     Y = N0.getOperand(0);
45039   else
45040     return false;
45041 
45042   // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
45043   // ANDNP combine lets other combines happen first that prevent matching.
45044   return true;
45045 }
45046 
45047 // Try to fold:
45048 //   (or (and (m, y), (pandn m, x)))
45049 // into:
45050 //   (vselect m, x, y)
45051 // As a special case, try to fold:
45052 //   (or (and (m, (sub 0, x)), (pandn m, x)))
45053 // into:
45054 //   (sub (xor X, M), M)
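// A brief note on the mask requirement (summarizing the check below): the byte
// blend selects on the sign bit of each byte, so this is only safe when the
// mask is known to be all-ones or all-zeros per element, e.g. a compare
// result.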
45055 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
45056                                             const X86Subtarget &Subtarget) {
45057   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
45058 
45059   EVT VT = N->getValueType(0);
45060   if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
45061         (VT.is256BitVector() && Subtarget.hasInt256())))
45062     return SDValue();
45063 
45064   SDValue X, Y, Mask;
45065   if (!matchLogicBlend(N, X, Y, Mask))
45066     return SDValue();
45067 
45068   // Validate that X, Y, and Mask are bitcasts, and see through them.
45069   Mask = peekThroughBitcasts(Mask);
45070   X = peekThroughBitcasts(X);
45071   Y = peekThroughBitcasts(Y);
45072 
45073   EVT MaskVT = Mask.getValueType();
45074   unsigned EltBits = MaskVT.getScalarSizeInBits();
45075 
45076   // TODO: Attempt to handle floating point cases as well?
45077   if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
45078     return SDValue();
45079 
45080   SDLoc DL(N);
45081 
45082   // Attempt to combine to conditional negate: (sub (xor X, M), M)
45083   if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
45084                                                            DAG, Subtarget))
45085     return Res;
45086 
45087   // PBLENDVB is only available on SSE 4.1.
45088   if (!Subtarget.hasSSE41())
45089     return SDValue();
45090 
45091   // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
45092   if (Subtarget.hasVLX())
45093     return SDValue();
45094 
45095   MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
45096 
45097   X = DAG.getBitcast(BlendVT, X);
45098   Y = DAG.getBitcast(BlendVT, Y);
45099   Mask = DAG.getBitcast(BlendVT, Mask);
45100   Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
45101   return DAG.getBitcast(VT, Mask);
45102 }
45103 
45104 // Helper function for combineOrCmpEqZeroToCtlzSrl
45105 // Transforms:
45106 //   seteq(cmp x, 0)
45107 //   into:
45108 //   srl(ctlz x), log2(bitsize(x))
45109 // Input pattern is checked by caller.
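// E.g. for a 32-bit x: lzcnt returns 32 exactly when x == 0, so
//   seteq(cmp x, 0)  ==>  (ctlz x) >> 5
// yields 1 for x == 0 and 0 otherwise (log2(32) == 5).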
45110 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
45111                                           SelectionDAG &DAG) {
45112   SDValue Cmp = Op.getOperand(1);
45113   EVT VT = Cmp.getOperand(0).getValueType();
45114   unsigned Log2b = Log2_32(VT.getSizeInBits());
45115   SDLoc dl(Op);
45116   SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
45117   // The result of the shift is true or false, and on X86, the 32-bit
45118   // encoding of shr and lzcnt is more desirable.
45119   SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
45120   SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
45121                             DAG.getConstant(Log2b, dl, MVT::i8));
45122   return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
45123 }
45124 
45125 // Try to transform:
45126 //   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
45127 //   into:
45128 //   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
45129 // Will also attempt to match more generic cases, eg:
45130 //   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
45131 // Only applies if the target supports the FastLZCNT feature.
45132 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
45133                                            TargetLowering::DAGCombinerInfo &DCI,
45134                                            const X86Subtarget &Subtarget) {
45135   if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
45136     return SDValue();
45137 
45138   auto isORCandidate = [](SDValue N) {
45139     return (N->getOpcode() == ISD::OR && N->hasOneUse());
45140   };
45141 
45142   // Check the zero extend is extending to 32-bit or more. The code generated by
45143   // srl(ctlz) for 16-bit or less variants of the pattern would require extra
45144   // instructions to clear the upper bits.
45145   if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
45146       !isORCandidate(N->getOperand(0)))
45147     return SDValue();
45148 
45149   // Check the node matches: setcc(eq, cmp 0)
45150   auto isSetCCCandidate = [](SDValue N) {
45151     return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
45152            X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
45153            N->getOperand(1).getOpcode() == X86ISD::CMP &&
45154            isNullConstant(N->getOperand(1).getOperand(1)) &&
45155            N->getOperand(1).getValueType().bitsGE(MVT::i32);
45156   };
45157 
45158   SDNode *OR = N->getOperand(0).getNode();
45159   SDValue LHS = OR->getOperand(0);
45160   SDValue RHS = OR->getOperand(1);
45161 
45162   // Save nodes matching or(or, setcc(eq, cmp 0)).
45163   SmallVector<SDNode *, 2> ORNodes;
45164   while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
45165           (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
45166     ORNodes.push_back(OR);
45167     OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
45168     LHS = OR->getOperand(0);
45169     RHS = OR->getOperand(1);
45170   }
45171 
45172   // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
45173   if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
45174       !isORCandidate(SDValue(OR, 0)))
45175     return SDValue();
45176 
45177   // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower
45178   // it to
45179   //   or(srl(ctlz), srl(ctlz)).
45180   // The DAG combiner can then fold it into:
45181   //   srl(or(ctlz, ctlz)).
45182   EVT VT = OR->getValueType(0);
45183   SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
45184   SDValue Ret, NewRHS;
45185   if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
45186     Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
45187 
45188   if (!Ret)
45189     return SDValue();
45190 
45191   // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
45192   while (ORNodes.size() > 0) {
45193     OR = ORNodes.pop_back_val();
45194     LHS = OR->getOperand(0);
45195     RHS = OR->getOperand(1);
45196     // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
45197     if (RHS->getOpcode() == ISD::OR)
45198       std::swap(LHS, RHS);
45199     NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
45200     if (!NewRHS)
45201       return SDValue();
45202     Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
45203   }
45204 
45205   if (Ret)
45206     Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
45207 
45208   return Ret;
45209 }
45210 
45211 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
45212                          TargetLowering::DAGCombinerInfo &DCI,
45213                          const X86Subtarget &Subtarget) {
45214   SDValue N0 = N->getOperand(0);
45215   SDValue N1 = N->getOperand(1);
45216   EVT VT = N->getValueType(0);
45217 
45218   // If this is SSE1 only convert to FOR to avoid scalarization.
45219   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
45220     return DAG.getBitcast(MVT::v4i32,
45221                           DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
45222                                       DAG.getBitcast(MVT::v4f32, N0),
45223                                       DAG.getBitcast(MVT::v4f32, N1)));
45224   }
45225 
45226   // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
45227   // TODO: Support multiple SrcOps.
45228   if (VT == MVT::i1) {
45229     SmallVector<SDValue, 2> SrcOps;
45230     SmallVector<APInt, 2> SrcPartials;
45231     if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
45232         SrcOps.size() == 1) {
45233       SDLoc dl(N);
45234       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45235       unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
45236       EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
45237       SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
45238       if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
45239         Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
45240       if (Mask) {
45241         assert(SrcPartials[0].getBitWidth() == NumElts &&
45242                "Unexpected partial reduction mask");
45243         SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
45244         SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
45245         Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
45246         return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
45247       }
45248     }
45249   }
45250 
45251   if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
45252     return R;
45253 
45254   if (DCI.isBeforeLegalizeOps())
45255     return SDValue();
45256 
45257   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
45258     return R;
45259 
45260   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
45261     return FPLogic;
45262 
45263   if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
45264     return R;
45265 
45266   if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
45267     return R;
45268 
45269   // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
45270   // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
45271   // iff the upper elements of the non-shifted arg are zero.
45272   // KUNPCK requires 16+ bool vector elements.
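  // E.g. for v32i1: or(X, kshiftl(Y, 16)) with the upper 16 bits of X known
  // zero is just the concatenation of the low halves of X and Y, i.e. a
  // KUNPCKWD (an illustrative instance of the rule above).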
45273   if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
45274     unsigned NumElts = VT.getVectorNumElements();
45275     unsigned HalfElts = NumElts / 2;
45276     APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
45277     if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
45278         N1.getConstantOperandAPInt(1) == HalfElts &&
45279         DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
45280       SDLoc dl(N);
45281       return DAG.getNode(
45282           ISD::CONCAT_VECTORS, dl, VT,
45283           extractSubVector(N0, 0, DAG, dl, HalfElts),
45284           extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
45285     }
45286     if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
45287         N0.getConstantOperandAPInt(1) == HalfElts &&
45288         DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
45289       SDLoc dl(N);
45290       return DAG.getNode(
45291           ISD::CONCAT_VECTORS, dl, VT,
45292           extractSubVector(N1, 0, DAG, dl, HalfElts),
45293           extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
45294     }
45295   }
45296 
45297   // Attempt to recursively combine an OR of shuffles.
45298   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
45299     SDValue Op(N, 0);
45300     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
45301       return Res;
45302   }
45303 
45304   return SDValue();
45305 }
45306 
45307 /// Try to turn tests against the signbit in the form of:
45308 ///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
45309 /// into:
45310 ///   SETGT(X, -1)
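/// (The srl extracts the sign bit and the xor inverts it, so the result is 1
/// exactly when X is non-negative, which is what SETGT(X, -1) computes.)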
45311 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
45312   // This is only worth doing if the output type is i8 or i1.
45313   EVT ResultType = N->getValueType(0);
45314   if (ResultType != MVT::i8 && ResultType != MVT::i1)
45315     return SDValue();
45316 
45317   SDValue N0 = N->getOperand(0);
45318   SDValue N1 = N->getOperand(1);
45319 
45320   // We should be performing an xor against a truncated shift.
45321   if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
45322     return SDValue();
45323 
45324   // Make sure we are performing an xor against one.
45325   if (!isOneConstant(N1))
45326     return SDValue();
45327 
45328   // SetCC on x86 zero extends so only act on this if it's a logical shift.
45329   SDValue Shift = N0.getOperand(0);
45330   if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
45331     return SDValue();
45332 
45333   // Make sure we are truncating from one of i16, i32 or i64.
45334   EVT ShiftTy = Shift.getValueType();
45335   if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
45336     return SDValue();
45337 
45338   // Make sure the shift amount extracts the sign bit.
45339   if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
45340       Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
45341     return SDValue();
45342 
45343   // Create a greater-than comparison against -1.
45344   // N.B. Using SETGE against 0 works but we want a canonical-looking
45345   // comparison; using SETGT matches up with what TranslateX86CC does.
45346   SDLoc DL(N);
45347   SDValue ShiftOp = Shift.getOperand(0);
45348   EVT ShiftOpTy = ShiftOp.getValueType();
45349   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45350   EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
45351                                                *DAG.getContext(), ResultType);
45352   SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
45353                               DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
45354   if (SetCCResultType != ResultType)
45355     Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
45356   return Cond;
45357 }
45358 
45359 /// Turn vector tests of the signbit in the form of:
45360 ///   xor (sra X, elt_size(X)-1), -1
45361 /// into:
45362 ///   pcmpgt X, -1
45363 ///
45364 /// This should be called before type legalization because the pattern may not
45365 /// persist after that.
45366 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
45367                                          const X86Subtarget &Subtarget) {
45368   EVT VT = N->getValueType(0);
45369   if (!VT.isSimple())
45370     return SDValue();
45371 
45372   switch (VT.getSimpleVT().SimpleTy) {
45373   default: return SDValue();
45374   case MVT::v16i8:
45375   case MVT::v8i16:
45376   case MVT::v4i32:
45377   case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
45378   case MVT::v32i8:
45379   case MVT::v16i16:
45380   case MVT::v8i32:
45381   case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
45382   }
45383 
45384   // There must be a shift right algebraic before the xor, and the xor must be a
45385   // 'not' operation.
45386   SDValue Shift = N->getOperand(0);
45387   SDValue Ones = N->getOperand(1);
45388   if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
45389       !ISD::isBuildVectorAllOnes(Ones.getNode()))
45390     return SDValue();
45391 
45392   // The shift should be smearing the sign bit across each vector element.
45393   auto *ShiftAmt =
45394       isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
45395   if (!ShiftAmt ||
45396       ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
45397     return SDValue();
45398 
45399   // Create a greater-than comparison against -1. We don't use the more obvious
45400   // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
45401   return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
45402 }
45403 
45404 /// Detect patterns of truncation with unsigned saturation:
45405 ///
45406 /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
45407 ///   Return the source value x to be truncated or SDValue() if the pattern was
45408 ///   not matched.
45409 ///
45410 /// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
45411 ///   where C1 >= 0 and C2 is unsigned max of destination type.
45412 ///
45413 ///    (truncate (smax (smin (x, C2), C1)) to dest_type)
45414 ///   where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
45415 ///
45416 ///   These two patterns are equivalent to:
45417 ///   (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
45418 ///   So return the smax(x, C1) value to be truncated or SDValue() if the
45419 ///   pattern was not matched.
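/// For instance (illustrative values): when truncating v4i32 to v4i16,
/// pattern 1 is
///   (truncate (umin (x, 65535)) to v4i16)
/// and the value returned is x, which the caller can then lower with an
/// unsigned saturating pack (e.g. PACKUSDW).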
45420 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
45421                                  const SDLoc &DL) {
45422   EVT InVT = In.getValueType();
45423 
45424   // Saturation with truncation. We truncate from InVT to VT.
45425   assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
45426          "Unexpected types for truncate operation");
45427 
45428   // Match min/max and return limit value as a parameter.
45429   auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
45430     if (V.getOpcode() == Opcode &&
45431         ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
45432       return V.getOperand(0);
45433     return SDValue();
45434   };
45435 
45436   APInt C1, C2;
45437   if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
45438     // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
45439     // the element size of the destination type.
45440     if (C2.isMask(VT.getScalarSizeInBits()))
45441       return UMin;
45442 
45443   if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
45444     if (MatchMinMax(SMin, ISD::SMAX, C1))
45445       if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
45446         return SMin;
45447 
45448   if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
45449     if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
45450       if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
45451           C2.uge(C1)) {
45452         return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
45453       }
45454 
45455   return SDValue();
45456 }
45457 
45458 /// Detect patterns of truncation with signed saturation:
45459 /// (truncate (smin ((smax (x, signed_min_of_dest_type)),
45460 ///                  signed_max_of_dest_type)) to dest_type)
45461 /// or:
45462 /// (truncate (smax ((smin (x, signed_max_of_dest_type)),
45463 ///                  signed_min_of_dest_type)) to dest_type).
45464 /// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
45465 /// Return the source value to be truncated or SDValue() if the pattern was not
45466 /// matched.
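/// For example (a sketch): when truncating to an i8 element type, this matches
///   smin (smax (x, -128), 127)
/// (or the clamps in the opposite order) and returns x, suitable for a signed
/// saturating pack such as PACKSSWB for an i16 source.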
45467 static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
45468   unsigned NumDstBits = VT.getScalarSizeInBits();
45469   unsigned NumSrcBits = In.getScalarValueSizeInBits();
45470   assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
45471 
45472   auto MatchMinMax = [](SDValue V, unsigned Opcode,
45473                         const APInt &Limit) -> SDValue {
45474     APInt C;
45475     if (V.getOpcode() == Opcode &&
45476         ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
45477       return V.getOperand(0);
45478     return SDValue();
45479   };
45480 
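  // With MatchPackUS we clamp to [0, unsigned_max_of_dest_type]; otherwise we
  // clamp to the signed range of the destination type.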
45481   APInt SignedMax, SignedMin;
45482   if (MatchPackUS) {
45483     SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
45484     SignedMin = APInt(NumSrcBits, 0);
45485   } else {
45486     SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
45487     SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
45488   }
45489 
45490   if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
45491     if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
45492       return SMax;
45493 
45494   if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
45495     if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
45496       return SMin;
45497 
45498   return SDValue();
45499 }
45500 
45501 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
45502                                       SelectionDAG &DAG,
45503                                       const X86Subtarget &Subtarget) {
45504   if (!Subtarget.hasSSE2() || !VT.isVector())
45505     return SDValue();
45506 
45507   EVT SVT = VT.getVectorElementType();
45508   EVT InVT = In.getValueType();
45509   EVT InSVT = InVT.getVectorElementType();
45510 
45511   // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
45512   // split across two registers, we can use a packusdw+perm to clamp to 0-65535
45513   // and concatenate at the same time. Then we can use a final vpmovuswb to
45514   // clip to 0-255.
45515   if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
45516       InVT == MVT::v16i32 && VT == MVT::v16i8) {
45517     if (auto USatVal = detectSSatPattern(In, VT, true)) {
45518       // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
45519       SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
45520                                            DL, DAG, Subtarget);
45521       assert(Mid && "Failed to pack!");
45522       return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
45523     }
45524   }
45525 
45526   // vXi32 truncate instructions are available with AVX512F.
45527   // vXi16 truncate instructions are only available with AVX512BW.
45528   // For 256-bit or smaller vectors, we require VLX.
45529   // FIXME: We could widen truncates to 512 to remove the VLX restriction.
45530   // If the result type is 256 bits or larger and we have disabled 512-bit
45531   // registers, we should go ahead and use the pack instructions if possible.
45532   bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
45533                        (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
45534                       (InVT.getSizeInBits() > 128) &&
45535                       (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
45536                       !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
45537 
45538   if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
45539       VT.getSizeInBits() >= 64 &&
45540       (SVT == MVT::i8 || SVT == MVT::i16) &&
45541       (InSVT == MVT::i16 || InSVT == MVT::i32)) {
45542     if (auto USatVal = detectSSatPattern(In, VT, true)) {
45543       // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
45544       // Only do this when the result is at least 64 bits or we'll be leaving
45545       // dangling PACKSSDW nodes.
45546       if (SVT == MVT::i8 && InSVT == MVT::i32) {
45547         EVT MidVT = VT.changeVectorElementType(MVT::i16);
45548         SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
45549                                              DAG, Subtarget);
45550         assert(Mid && "Failed to pack!");
45551         SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
45552                                            Subtarget);
45553         assert(V && "Failed to pack!");
45554         return V;
45555       } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
45556         return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
45557                                       Subtarget);
45558     }
45559     if (auto SSatVal = detectSSatPattern(In, VT))
45560       return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
45561                                     Subtarget);
45562   }
45563 
45564   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45565   if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
45566       Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
45567       (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
45568     unsigned TruncOpc = 0;
45569     SDValue SatVal;
45570     if (auto SSatVal = detectSSatPattern(In, VT)) {
45571       SatVal = SSatVal;
45572       TruncOpc = X86ISD::VTRUNCS;
45573     } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
45574       SatVal = USatVal;
45575       TruncOpc = X86ISD::VTRUNCUS;
45576     }
45577     if (SatVal) {
45578       unsigned ResElts = VT.getVectorNumElements();
45579       // If the input type is less than 512 bits and we don't have VLX, we need
45580       // to widen to 512 bits.
45581       if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
45582         unsigned NumConcats = 512 / InVT.getSizeInBits();
45583         ResElts *= NumConcats;
45584         SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
45585         ConcatOps[0] = SatVal;
45586         InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
45587                                 NumConcats * InVT.getVectorNumElements());
45588         SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
45589       }
45590       // Widen the result if it's narrower than 128 bits.
45591       if (ResElts * SVT.getSizeInBits() < 128)
45592         ResElts = 128 / SVT.getSizeInBits();
45593       EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
45594       SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
45595       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
45596                          DAG.getIntPtrConstant(0, DL));
45597     }
45598   }
45599 
45600   return SDValue();
45601 }
45602 
45603 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
45604 /// which is c = (a + b + 1) / 2, and replaces this operation with the
45605 /// efficient X86ISD::AVG instruction.
45606 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
45607                                 const X86Subtarget &Subtarget,
45608                                 const SDLoc &DL) {
45609   if (!VT.isVector())
45610     return SDValue();
45611   EVT InVT = In.getValueType();
45612   unsigned NumElems = VT.getVectorNumElements();
45613 
45614   EVT ScalarVT = VT.getVectorElementType();
45615   if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
45616     return SDValue();
45617 
45618   // InScalarVT is the intermediate type in the AVG pattern and it should be
45619   // wider than the original input type (i8/i16).
45620   EVT InScalarVT = InVT.getVectorElementType();
45621   if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
45622     return SDValue();
45623 
45624   if (!Subtarget.hasSSE2())
45625     return SDValue();
45626 
45627   // Detect the following pattern:
45628   //
45629   //   %1 = zext <N x i8> %a to <N x i32>
45630   //   %2 = zext <N x i8> %b to <N x i32>
45631   //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
45632   //   %4 = add nuw nsw <N x i32> %3, %2
45633   //   %5 = lshr <N x i32> %4, <i32 1 x N>
45634   //   %6 = trunc <N x i32> %5 to <N x i8>
45635   //
45636   // In AVX512, the last instruction can also be a trunc store.
45637   if (In.getOpcode() != ISD::SRL)
45638     return SDValue();
45639 
45640   // A lambda checking that the given SDValue is a constant vector where each
45641   // element is in the range [Min, Max].
45642   auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
45643     return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
45644       return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
45645     });
45646   };
45647 
45648   // Check if each element of the vector is right-shifted by one.
45649   SDValue LHS = In.getOperand(0);
45650   SDValue RHS = In.getOperand(1);
45651   if (!IsConstVectorInRange(RHS, 1, 1))
45652     return SDValue();
45653   if (LHS.getOpcode() != ISD::ADD)
45654     return SDValue();
45655 
45656   // Detect a pattern of a + b + 1 where the order doesn't matter.
45657   SDValue Operands[3];
45658   Operands[0] = LHS.getOperand(0);
45659   Operands[1] = LHS.getOperand(1);
45660 
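  // Builder used by SplitOpsAndApply to emit an X86ISD::AVG node for each
  // (possibly split) pair of operands.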
45661   auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45662                        ArrayRef<SDValue> Ops) {
45663     return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
45664   };
45665 
45666   auto AVGSplitter = [&](SDValue Op0, SDValue Op1) {
45667     // Pad to a power-of-2 vector, split+apply and extract the original vector.
45668     unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
45669     EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
45670     if (NumElemsPow2 != NumElems) {
45671       SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT));
45672       SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT));
45673       for (unsigned i = 0; i != NumElems; ++i) {
45674         SDValue Idx = DAG.getIntPtrConstant(i, DL);
45675         Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx);
45676         Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx);
45677       }
45678       Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0);
45679       Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1);
45680     }
45681     SDValue Res =
45682         SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder);
45683     if (NumElemsPow2 == NumElems)
45684       return Res;
45685     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
45686                        DAG.getIntPtrConstant(0, DL));
45687   };
45688 
45689   // Take care of the case when one of the operands is a constant vector whose
45690   // elements are all in the range [1, 256] (or [1, 65536] for i16 elements).
45691   if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
45692       Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
45693       Operands[0].getOperand(0).getValueType() == VT) {
45694     // The pattern is detected. Subtract one from the constant vector, then
45695     // demote it and emit X86ISD::AVG instruction.
45696     SDValue VecOnes = DAG.getConstant(1, DL, InVT);
45697     Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
45698     Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
45699     return AVGSplitter(Operands[0].getOperand(0), Operands[1]);
45700   }
45701 
45702   // Matches 'add like' patterns: add(Op0,Op1) or zext(or(Op0,Op1)).
45703   // Match the or case only if it's 'add-like', i.e. can be replaced by an add.
45704   auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
45705     if (ISD::ADD == V.getOpcode()) {
45706       Op0 = V.getOperand(0);
45707       Op1 = V.getOperand(1);
45708       return true;
45709     }
45710     if (ISD::ZERO_EXTEND != V.getOpcode())
45711       return false;
45712     V = V.getOperand(0);
45713     if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
45714         !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
45715       return false;
45716     Op0 = V.getOperand(0);
45717     Op1 = V.getOperand(1);
45718     return true;
45719   };
45720 
45721   SDValue Op0, Op1;
45722   if (FindAddLike(Operands[0], Op0, Op1))
45723     std::swap(Operands[0], Operands[1]);
45724   else if (!FindAddLike(Operands[1], Op0, Op1))
45725     return SDValue();
45726   Operands[2] = Op0;
45727   Operands[1] = Op1;
45728 
45729   // Now we have three operands of two additions. Check that one of them is a
45730   // constant vector with ones, and the other two can be promoted from i8/i16.
45731   for (int i = 0; i < 3; ++i) {
45732     if (!IsConstVectorInRange(Operands[i], 1, 1))
45733       continue;
45734     std::swap(Operands[i], Operands[2]);
45735 
45736     // Check if Operands[0] and Operands[1] are results of type promotion.
45737     for (int j = 0; j < 2; ++j)
45738       if (Operands[j].getValueType() != VT) {
45739         if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
45740             Operands[j].getOperand(0).getValueType() != VT)
45741           return SDValue();
45742         Operands[j] = Operands[j].getOperand(0);
45743       }
45744 
45745     // The pattern is detected, emit X86ISD::AVG instruction(s).
45746     return AVGSplitter(Operands[0], Operands[1]);
45747   }
45748 
45749   return SDValue();
45750 }
45751 
45752 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
45753                            TargetLowering::DAGCombinerInfo &DCI,
45754                            const X86Subtarget &Subtarget) {
45755   LoadSDNode *Ld = cast<LoadSDNode>(N);
45756   EVT RegVT = Ld->getValueType(0);
45757   EVT MemVT = Ld->getMemoryVT();
45758   SDLoc dl(Ld);
45759   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45760 
45761   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
45762   // into two 16-byte operations. Also split non-temporal aligned loads on
45763   // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
45764   ISD::LoadExtType Ext = Ld->getExtensionType();
45765   bool Fast;
45766   if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
45767       Ext == ISD::NON_EXTLOAD &&
45768       ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
45769         Ld->getAlignment() >= 16) ||
45770        (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
45771                                *Ld->getMemOperand(), &Fast) &&
45772         !Fast))) {
45773     unsigned NumElems = RegVT.getVectorNumElements();
45774     if (NumElems < 2)
45775       return SDValue();
45776 
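    // Split into two 16-byte loads from Ptr and Ptr+16 and concatenate the
    // results afterwards.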
45777     unsigned HalfOffset = 16;
45778     SDValue Ptr1 = Ld->getBasePtr();
45779     SDValue Ptr2 =
45780         DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
45781     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
45782                                   NumElems / 2);
45783     SDValue Load1 =
45784         DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
45785                     Ld->getOriginalAlign(),
45786                     Ld->getMemOperand()->getFlags());
45787     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
45788                                 Ld->getPointerInfo().getWithOffset(HalfOffset),
45789                                 Ld->getOriginalAlign(),
45790                                 Ld->getMemOperand()->getFlags());
45791     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
45792                              Load1.getValue(1), Load2.getValue(1));
45793 
45794     SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
45795     return DCI.CombineTo(N, NewVec, TF, true);
45796   }
45797 
45798   // Bool vector load - attempt to cast to an integer, as we have good
45799   // (vXiY *ext(vXi1 bitcast(iX))) handling.
45800   if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
45801       RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
45802     unsigned NumElts = RegVT.getVectorNumElements();
45803     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
45804     if (TLI.isTypeLegal(IntVT)) {
45805       SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
45806                                     Ld->getPointerInfo(),
45807                                     Ld->getOriginalAlign(),
45808                                     Ld->getMemOperand()->getFlags());
45809       SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
45810       return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
45811     }
45812   }
45813 
45814   // If we also broadcast this as a subvector to a wider type, then just extract
45815   // the lowest subvector.
45816   if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
45817       (RegVT.is128BitVector() || RegVT.is256BitVector())) {
45818     SDValue Ptr = Ld->getBasePtr();
45819     SDValue Chain = Ld->getChain();
45820     for (SDNode *User : Ptr->uses()) {
45821       if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
45822           cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
45823           cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
45824           cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
45825               MemVT.getSizeInBits() &&
45826           !User->hasAnyUseOfValue(1) &&
45827           User->getValueSizeInBits(0).getFixedSize() >
45828               RegVT.getFixedSizeInBits()) {
45829         SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
45830                                            RegVT.getSizeInBits());
45831         Extract = DAG.getBitcast(RegVT, Extract);
45832         return DCI.CombineTo(N, Extract, SDValue(User, 1));
45833       }
45834     }
45835   }
45836 
45837   // Cast ptr32 and ptr64 pointers to the default address space before a load.
45838   unsigned AddrSpace = Ld->getAddressSpace();
45839   if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
45840       AddrSpace == X86AS::PTR32_UPTR) {
45841     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
45842     if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
45843       SDValue Cast =
45844           DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
45845       return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
45846                          Ld->getOriginalAlign(),
45847                          Ld->getMemOperand()->getFlags());
45848     }
45849   }
45850 
45851   return SDValue();
45852 }
45853 
45854 /// If V is a build vector of boolean constants and exactly one of those
45855 /// constants is true, return the operand index of that true element.
45856 /// Otherwise, return -1.
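/// For example, given the build vector <i1 0, i1 1, i1 0, i1 0>, this
/// returns 1.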
45857 static int getOneTrueElt(SDValue V) {
45858   // This needs to be a build vector of booleans.
45859   // TODO: Checking for the i1 type matches the IR definition for the mask,
45860   // but the mask check could be loosened to i8 or other types. That might
45861   // also require checking more than 'allOnesValue'; eg, the x86 HW
45862   // instructions only require that the MSB is set for each mask element.
45863   // The ISD::MSTORE comments/definition do not specify how the mask operand
45864   // is formatted.
45865   auto *BV = dyn_cast<BuildVectorSDNode>(V);
45866   if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
45867     return -1;
45868 
45869   int TrueIndex = -1;
45870   unsigned NumElts = BV->getValueType(0).getVectorNumElements();
45871   for (unsigned i = 0; i < NumElts; ++i) {
45872     const SDValue &Op = BV->getOperand(i);
45873     if (Op.isUndef())
45874       continue;
45875     auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
45876     if (!ConstNode)
45877       return -1;
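    // countTrailingOnes() >= 1 means the low bit is set, i.e. this boolean
    // element is true.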
45878     if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
45879       // If we already found a one, this is too many.
45880       if (TrueIndex >= 0)
45881         return -1;
45882       TrueIndex = i;
45883     }
45884   }
45885   return TrueIndex;
45886 }
45887 
45888 /// Given a masked memory load/store operation, return true if it has one mask
45889 /// bit set. If it has one mask bit set, then also return the memory address of
45890 /// the scalar element to load/store, the vector index to insert/extract that
45891 /// scalar element, and the alignment for the scalar memory access.
45892 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
45893                                          SelectionDAG &DAG, SDValue &Addr,
45894                                          SDValue &Index, Align &Alignment,
45895                                          unsigned &Offset) {
45896   int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
45897   if (TrueMaskElt < 0)
45898     return false;
45899 
45900   // Get the address of the one scalar element that is specified by the mask
45901   // using the appropriate offset from the base pointer.
45902   EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
45903   Offset = 0;
45904   Addr = MaskedOp->getBasePtr();
45905   if (TrueMaskElt != 0) {
45906     Offset = TrueMaskElt * EltVT.getStoreSize();
45907     Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
45908                                     SDLoc(MaskedOp));
45909   }
45910 
45911   Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
45912   Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
45913                               EltVT.getStoreSize());
45914   return true;
45915 }
45916 
45917 /// If exactly one element of the mask is set for a non-extending masked load,
45918 /// it is a scalar load and vector insert.
45919 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
45920 /// mask have already been optimized in IR, so we don't bother with those here.
45921 static SDValue
45922 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
45923                              TargetLowering::DAGCombinerInfo &DCI,
45924                              const X86Subtarget &Subtarget) {
45925   assert(ML->isUnindexed() && "Unexpected indexed masked load!");
45926   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
45927   // However, some target hooks may need to be added to know when the transform
45928   // is profitable. Endianness would also have to be considered.
45929 
45930   SDValue Addr, VecIndex;
45931   Align Alignment;
45932   unsigned Offset;
45933   if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
45934     return SDValue();
45935 
45936   // Load the one scalar element that is specified by the mask using the
45937   // appropriate offset from the base pointer.
45938   SDLoc DL(ML);
45939   EVT VT = ML->getValueType(0);
45940   EVT EltVT = VT.getVectorElementType();
45941 
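  // On 32-bit targets i64 is not a legal scalar type, so load the element as
  // f64 instead and bitcast the vector type to match.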
45942   EVT CastVT = VT;
45943   if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
45944     EltVT = MVT::f64;
45945     CastVT = VT.changeVectorElementType(EltVT);
45946   }
45947 
45948   SDValue Load =
45949       DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
45950                   ML->getPointerInfo().getWithOffset(Offset),
45951                   Alignment, ML->getMemOperand()->getFlags());
45952 
45953   SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
45954 
45955   // Insert the loaded element into the appropriate place in the vector.
45956   SDValue Insert =
45957       DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
45958   Insert = DAG.getBitcast(VT, Insert);
45959   return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
45960 }
45961 
45962 static SDValue
45963 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
45964                               TargetLowering::DAGCombinerInfo &DCI) {
45965   assert(ML->isUnindexed() && "Unexpected indexed masked load!");
45966   if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
45967     return SDValue();
45968 
45969   SDLoc DL(ML);
45970   EVT VT = ML->getValueType(0);
45971 
45972   // If we are loading the first and last elements of a vector, it is safe and
45973   // always faster to load the whole vector. Replace the masked load with a
45974   // vector load and select.
45975   unsigned NumElts = VT.getVectorNumElements();
45976   BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
45977   bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
45978   bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
45979   if (LoadFirstElt && LoadLastElt) {
45980     SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
45981                                 ML->getMemOperand());
45982     SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
45983                                   ML->getPassThru());
45984     return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
45985   }
45986 
45987   // Convert a masked load with a constant mask into a masked load and a select.
45988   // This allows the select operation to use a faster kind of select instruction
45989   // (for example, vblendvps -> vblendps).
45990 
45991   // Don't try this if the pass-through operand is already undefined. That would
45992   // cause an infinite loop because that's what we're about to create.
45993   if (ML->getPassThru().isUndef())
45994     return SDValue();
45995 
45996   if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
45997     return SDValue();
45998 
45999   // The new masked load has an undef pass-through operand. The select uses the
46000   // original pass-through operand.
46001   SDValue NewML = DAG.getMaskedLoad(
46002       VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
46003       DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
46004       ML->getAddressingMode(), ML->getExtensionType());
46005   SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
46006                                 ML->getPassThru());
46007 
46008   return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
46009 }
46010 
46011 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
46012                                  TargetLowering::DAGCombinerInfo &DCI,
46013                                  const X86Subtarget &Subtarget) {
46014   auto *Mld = cast<MaskedLoadSDNode>(N);
46015 
46016   // TODO: Expanding load with constant mask may be optimized as well.
46017   if (Mld->isExpandingLoad())
46018     return SDValue();
46019 
46020   if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
46021     if (SDValue ScalarLoad =
46022             reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
46023       return ScalarLoad;
46024 
46025     // TODO: Do some AVX512 subsets benefit from this transform?
46026     if (!Subtarget.hasAVX512())
46027       if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
46028         return Blend;
46029   }
46030 
46031   // If the mask value has been legalized to a non-boolean vector, try to
46032   // simplify ops leading up to it. We only demand the MSB of each lane.
46033   SDValue Mask = Mld->getMask();
46034   if (Mask.getScalarValueSizeInBits() != 1) {
46035     EVT VT = Mld->getValueType(0);
46036     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46037     APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
46038     if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
46039       if (N->getOpcode() != ISD::DELETED_NODE)
46040         DCI.AddToWorklist(N);
46041       return SDValue(N, 0);
46042     }
46043     if (SDValue NewMask =
46044             TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
46045       return DAG.getMaskedLoad(
46046           VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
46047           NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
46048           Mld->getAddressingMode(), Mld->getExtensionType());
46049   }
46050 
46051   return SDValue();
46052 }
46053 
46054 /// If exactly one element of the mask is set for a non-truncating masked store,
46055 /// it is a vector extract and scalar store.
46056 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
46057 /// mask have already been optimized in IR, so we don't bother with those here.
46058 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
46059                                               SelectionDAG &DAG,
46060                                               const X86Subtarget &Subtarget) {
46061   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
46062   // However, some target hooks may need to be added to know when the transform
46063   // is profitable. Endianness would also have to be considered.
46064 
46065   SDValue Addr, VecIndex;
46066   Align Alignment;
46067   unsigned Offset;
46068   if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
46069     return SDValue();
46070 
46071   // Extract the one scalar element that is actually being stored.
46072   SDLoc DL(MS);
46073   SDValue Value = MS->getValue();
46074   EVT VT = Value.getValueType();
46075   EVT EltVT = VT.getVectorElementType();
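  // On 32-bit targets i64 is not a legal scalar type, so extract and store the
  // element as f64 instead.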
46076   if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
46077     EltVT = MVT::f64;
46078     EVT CastVT = VT.changeVectorElementType(EltVT);
46079     Value = DAG.getBitcast(CastVT, Value);
46080   }
46081   SDValue Extract =
46082       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
46083 
46084   // Store that element at the appropriate offset from the base pointer.
46085   return DAG.getStore(MS->getChain(), DL, Extract, Addr,
46086                       MS->getPointerInfo().getWithOffset(Offset),
46087                       Alignment, MS->getMemOperand()->getFlags());
46088 }
46089 
46090 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
46091                                   TargetLowering::DAGCombinerInfo &DCI,
46092                                   const X86Subtarget &Subtarget) {
46093   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
46094   if (Mst->isCompressingStore())
46095     return SDValue();
46096 
46097   EVT VT = Mst->getValue().getValueType();
46098   SDLoc dl(Mst);
46099   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46100 
46101   if (Mst->isTruncatingStore())
46102     return SDValue();
46103 
46104   if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
46105     return ScalarStore;
46106 
46107   // If the mask value has been legalized to a non-boolean vector, try to
46108   // simplify ops leading up to it. We only demand the MSB of each lane.
46109   SDValue Mask = Mst->getMask();
46110   if (Mask.getScalarValueSizeInBits() != 1) {
46111     APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
46112     if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
46113       if (N->getOpcode() != ISD::DELETED_NODE)
46114         DCI.AddToWorklist(N);
46115       return SDValue(N, 0);
46116     }
46117     if (SDValue NewMask =
46118             TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
46119       return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
46120                                 Mst->getBasePtr(), Mst->getOffset(), NewMask,
46121                                 Mst->getMemoryVT(), Mst->getMemOperand(),
46122                                 Mst->getAddressingMode());
46123   }
46124 
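  // If the value being stored is a truncate with a legal truncating store,
  // fold the truncation into a truncating masked store.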
46125   SDValue Value = Mst->getValue();
46126   if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
46127       TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
46128                             Mst->getMemoryVT())) {
46129     return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
46130                               Mst->getBasePtr(), Mst->getOffset(), Mask,
46131                               Mst->getMemoryVT(), Mst->getMemOperand(),
46132                               Mst->getAddressingMode(), true);
46133   }
46134 
46135   return SDValue();
46136 }
46137 
46138 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
46139                             TargetLowering::DAGCombinerInfo &DCI,
46140                             const X86Subtarget &Subtarget) {
46141   StoreSDNode *St = cast<StoreSDNode>(N);
46142   EVT StVT = St->getMemoryVT();
46143   SDLoc dl(St);
46144   SDValue StoredVal = St->getValue();
46145   EVT VT = StoredVal.getValueType();
46146   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46147 
46148   // Convert a store of vXi1 into a store of iX and a bitcast.
46149   if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
46150       VT.getVectorElementType() == MVT::i1) {
46151 
46152     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
46153     StoredVal = DAG.getBitcast(NewVT, StoredVal);
46154 
46155     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46156                         St->getPointerInfo(), St->getOriginalAlign(),
46157                         St->getMemOperand()->getFlags());
46158   }
46159 
46160   // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
46161   // This will avoid a copy to k-register.
46162   if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
46163       StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
46164       StoredVal.getOperand(0).getValueType() == MVT::i8) {
46165     SDValue Val = StoredVal.getOperand(0);
46166     // We must store zeros to the unused bits.
46167     Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
46168     return DAG.getStore(St->getChain(), dl, Val,
46169                         St->getBasePtr(), St->getPointerInfo(),
46170                         St->getOriginalAlign(),
46171                         St->getMemOperand()->getFlags());
46172   }
46173 
46174   // Widen v2i1/v4i1 stores to v8i1.
46175   if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
46176       Subtarget.hasAVX512()) {
46177     unsigned NumConcats = 8 / VT.getVectorNumElements();
46178     // We must store zeros to the unused bits.
46179     SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
46180     Ops[0] = StoredVal;
46181     StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
46182     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46183                         St->getPointerInfo(), St->getOriginalAlign(),
46184                         St->getMemOperand()->getFlags());
46185   }
46186 
46187   // Turn vXi1 stores of constants into a scalar store.
46188   if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
46189        VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
46190       ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
46191     // If it's a v64i1 store without 64-bit support, we need two stores.
46192     if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
46193       SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
46194                                       StoredVal->ops().slice(0, 32));
46195       Lo = combinevXi1ConstantToInteger(Lo, DAG);
46196       SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
46197                                       StoredVal->ops().slice(32, 32));
46198       Hi = combinevXi1ConstantToInteger(Hi, DAG);
46199 
46200       SDValue Ptr0 = St->getBasePtr();
46201       SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
46202 
46203       SDValue Ch0 =
46204           DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
46205                        St->getOriginalAlign(),
46206                        St->getMemOperand()->getFlags());
46207       SDValue Ch1 =
46208           DAG.getStore(St->getChain(), dl, Hi, Ptr1,
46209                        St->getPointerInfo().getWithOffset(4),
46210                        St->getOriginalAlign(),
46211                        St->getMemOperand()->getFlags());
46212       return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
46213     }
46214 
46215     StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
46216     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46217                         St->getPointerInfo(), St->getOriginalAlign(),
46218                         St->getMemOperand()->getFlags());
46219   }
46220 
46221   // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
46222   // Sandy Bridge, perform two 16-byte stores.
46223   bool Fast;
46224   if (VT.is256BitVector() && StVT == VT &&
46225       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
46226                              *St->getMemOperand(), &Fast) &&
46227       !Fast) {
46228     unsigned NumElems = VT.getVectorNumElements();
46229     if (NumElems < 2)
46230       return SDValue();
46231 
46232     return splitVectorStore(St, DAG);
46233   }
46234 
46235   // Split under-aligned vector non-temporal stores.
46236   if (St->isNonTemporal() && StVT == VT &&
46237       St->getAlignment() < VT.getStoreSize()) {
46238     // ZMM/YMM nt-stores - either it can be stored as a series of shorter
46239     // vectors or the legalizer can scalarize it to use MOVNTI.
46240     if (VT.is256BitVector() || VT.is512BitVector()) {
46241       unsigned NumElems = VT.getVectorNumElements();
46242       if (NumElems < 2)
46243         return SDValue();
46244       return splitVectorStore(St, DAG);
46245     }
46246 
46247     // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
46248     // to use MOVNTI.
46249     if (VT.is128BitVector() && Subtarget.hasSSE2()) {
46250       MVT NTVT = Subtarget.hasSSE4A()
46251                      ? MVT::v2f64
46252                      : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
46253       return scalarizeVectorStore(St, NTVT, DAG);
46254     }
46255   }
46256 
46257   // Try to optimize v16i16->v16i8 truncating stores when BWI is not
46258   // supported but AVX512F is, by extending to v16i32 and truncating.
46259   if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
46260       St->getValue().getOpcode() == ISD::TRUNCATE &&
46261       St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
46262       TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
46263       St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
46264     SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
46265     return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
46266                              MVT::v16i8, St->getMemOperand());
46267   }
46268 
46269   // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
46270   if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
46271       (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
46272        StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
46273       TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
46274     bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
46275     return EmitTruncSStore(IsSigned, St->getChain(),
46276                            dl, StoredVal.getOperand(0), St->getBasePtr(),
46277                            VT, St->getMemOperand(), DAG);
46278   }
46279 
46280   // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
46281   if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
46282     auto IsExtractedElement = [](SDValue V) {
46283       if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
46284         V = V.getOperand(0);
46285       unsigned Opc = V.getOpcode();
46286       if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
46287         if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
46288           return V.getOperand(0);
46289       }
46290       return SDValue();
46291     };
46292     if (SDValue Extract = IsExtractedElement(StoredVal)) {
46293       SDValue Trunc = peekThroughOneUseBitcasts(Extract);
46294       if (Trunc.getOpcode() == X86ISD::VTRUNC) {
46295         SDValue Src = Trunc.getOperand(0);
46296         MVT DstVT = Trunc.getSimpleValueType();
46297         MVT SrcVT = Src.getSimpleValueType();
46298         unsigned NumSrcElts = SrcVT.getVectorNumElements();
46299         unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
46300         MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
46301         if (NumTruncBits == VT.getSizeInBits() &&
46302             TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
46303           return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
46304                                    TruncVT, St->getMemOperand());
46305         }
46306       }
46307     }
46308   }
46309 
46310   // Optimize trunc store (of multiple scalars) to shuffle and store.
46311   // First, pack all of the elements in one place. Next, store to memory
46312   // in fewer chunks.
46313   if (St->isTruncatingStore() && VT.isVector()) {
46314     // Check if we can detect an AVG pattern from the truncation. If yes,
46315     // replace the trunc store by a normal store with the result of X86ISD::AVG
46316     // instruction.
46317     if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
46318       if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
46319                                          Subtarget, dl))
46320         return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
46321                             St->getPointerInfo(), St->getOriginalAlign(),
46322                             St->getMemOperand()->getFlags());
46323 
46324     if (TLI.isTruncStoreLegal(VT, StVT)) {
46325       if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
46326         return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
46327                                dl, Val, St->getBasePtr(),
46328                                St->getMemoryVT(), St->getMemOperand(), DAG);
46329       if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
46330                                           DAG, dl))
46331         return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
46332                                dl, Val, St->getBasePtr(),
46333                                St->getMemoryVT(), St->getMemOperand(), DAG);
46334     }
46335 
46336     return SDValue();
46337   }
46338 
46339   // Cast ptr32 and ptr64 pointers to the default address space before a store.
46340   unsigned AddrSpace = St->getAddressSpace();
46341   if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
46342       AddrSpace == X86AS::PTR32_UPTR) {
46343     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
46344     if (PtrVT != St->getBasePtr().getSimpleValueType()) {
46345       SDValue Cast =
46346           DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
46347       return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
46348                           St->getPointerInfo(), St->getOriginalAlign(),
46349                           St->getMemOperand()->getFlags(), St->getAAInfo());
46350     }
46351   }
46352 
46353   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
46354   // the FP state in cases where an emms may be missing.
46355   // A preferable solution to the general problem is to figure out the right
46356   // places to insert EMMS.  This qualifies as a quick hack.
46357 
46358   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
46359   if (VT.getSizeInBits() != 64)
46360     return SDValue();
46361 
46362   const Function &F = DAG.getMachineFunction().getFunction();
46363   bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
46364   bool F64IsLegal =
46365       !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
46366   if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
46367       isa<LoadSDNode>(St->getValue()) &&
46368       cast<LoadSDNode>(St->getValue())->isSimple() &&
46369       St->getChain().hasOneUse() && St->isSimple()) {
46370     LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
46371 
46372     if (!ISD::isNormalLoad(Ld))
46373       return SDValue();
46374 
46375     // Avoid the transformation if there are multiple uses of the loaded value.
46376     if (!Ld->hasNUsesOfValue(1, 0))
46377       return SDValue();
46378 
46379     SDLoc LdDL(Ld);
46380     SDLoc StDL(N);
46381     // Lower to a single movq load/store pair.
46382     SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
46383                                 Ld->getBasePtr(), Ld->getMemOperand());
46384 
46385     // Make sure new load is placed in same chain order.
46386     DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
46387     return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
46388                         St->getMemOperand());
46389   }
46390 
46391   // This is similar to the above case, but here we handle a scalar 64-bit
46392   // integer store that is extracted from a vector on a 32-bit target.
46393   // If we have SSE2, then we can treat it like a floating-point double
46394   // to get past legalization. The execution dependencies fixup pass will
46395   // choose the optimal machine instruction for the store if this really is
46396   // an integer or v2f32 rather than an f64.
46397   if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
46398       St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
46399     SDValue OldExtract = St->getOperand(1);
46400     SDValue ExtOp0 = OldExtract.getOperand(0);
46401     unsigned VecSize = ExtOp0.getValueSizeInBits();
46402     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
46403     SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
46404     SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
46405                                      BitCast, OldExtract.getOperand(1));
46406     return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
46407                         St->getPointerInfo(), St->getOriginalAlign(),
46408                         St->getMemOperand()->getFlags());
46409   }
46410 
46411   return SDValue();
46412 }
46413 
46414 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
46415                                      TargetLowering::DAGCombinerInfo &DCI,
46416                                      const X86Subtarget &Subtarget) {
46417   auto *St = cast<MemIntrinsicSDNode>(N);
46418 
46419   SDValue StoredVal = N->getOperand(1);
46420   MVT VT = StoredVal.getSimpleValueType();
46421   EVT MemVT = St->getMemoryVT();
46422 
46423   // Figure out which elements we demand.
46424   unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
46425   APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
46426 
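  // Try to simplify the stored vector based on just the demanded elements.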
46427   APInt KnownUndef, KnownZero;
46428   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46429   if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
46430                                      KnownZero, DCI)) {
46431     if (N->getOpcode() != ISD::DELETED_NODE)
46432       DCI.AddToWorklist(N);
46433     return SDValue(N, 0);
46434   }
46435 
46436   return SDValue();
46437 }
46438 
46439 /// Return 'true' if this vector operation is "horizontal"
46440 /// and return the operands for the horizontal operation in LHS and RHS.  A
46441 /// horizontal operation performs the binary operation on successive elements
46442 /// of its first operand, then on successive elements of its second operand,
46443 /// returning the resulting values in a vector.  For example, if
46444 ///   A = < float a0, float a1, float a2, float a3 >
46445 /// and
46446 ///   B = < float b0, float b1, float b2, float b3 >
46447 /// then the result of doing a horizontal operation on A and B is
46448 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
46449 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
46450 /// A horizontal-op B, for some already available A and B, and if so then LHS is
46451 /// set to A, RHS to B, and the routine returns 'true'.
46452 static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
46453                               SelectionDAG &DAG, const X86Subtarget &Subtarget,
46454                               bool IsCommutative,
46455                               SmallVectorImpl<int> &PostShuffleMask) {
46456   // If either operand is undef, bail out. The binop should be simplified.
46457   if (LHS.isUndef() || RHS.isUndef())
46458     return false;
46459 
46460   // Look for the following pattern:
46461   //   A = < float a0, float a1, float a2, float a3 >
46462   //   B = < float b0, float b1, float b2, float b3 >
46463   // and
46464   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
46465   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
46466   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
46467   // which is A horizontal-op B.
46468 
46469   MVT VT = LHS.getSimpleValueType();
46470   assert((VT.is128BitVector() || VT.is256BitVector()) &&
46471          "Unsupported vector type for horizontal add/sub");
46472   unsigned NumElts = VT.getVectorNumElements();
46473 
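  // Helper to view Op as a shuffle of two inputs (N0/N1) with a mask scaled to
  // NumElts elements. An extraction of the low half of a 256-bit shuffle is
  // handled by splitting that shuffle's input instead.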
46474   auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
46475                         SmallVectorImpl<int> &ShuffleMask) {
46476     bool UseSubVector = false;
46477     if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46478         Op.getOperand(0).getValueType().is256BitVector() &&
46479         llvm::isNullConstant(Op.getOperand(1))) {
46480       Op = Op.getOperand(0);
46481       UseSubVector = true;
46482     }
46483     SmallVector<SDValue, 2> SrcOps;
46484     SmallVector<int, 16> SrcMask, ScaledMask;
46485     SDValue BC = peekThroughBitcasts(Op);
46486     if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
46487         !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
46488           return Op.getValueSizeInBits() == BC.getValueSizeInBits();
46489         })) {
46490       resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
46491       if (!UseSubVector && SrcOps.size() <= 2 &&
46492           scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
46493         N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
46494         N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
46495         ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
46496       }
46497       if (UseSubVector && SrcOps.size() == 1 &&
46498           scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
46499         std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
46500         ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
46501         ShuffleMask.assign(Mask.begin(), Mask.end());
46502       }
46503     }
46504   };
46505 
46506   // View LHS in the form
46507   //   LHS = VECTOR_SHUFFLE A, B, LMask
46508   // If LHS is not a shuffle, then pretend it is the identity shuffle:
46509   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
46510   // NOTE: A default initialized SDValue represents an UNDEF of type VT.
46511   SDValue A, B;
46512   SmallVector<int, 16> LMask;
46513   GetShuffle(LHS, A, B, LMask);
46514 
46515   // Likewise, view RHS in the form
46516   //   RHS = VECTOR_SHUFFLE C, D, RMask
46517   SDValue C, D;
46518   SmallVector<int, 16> RMask;
46519   GetShuffle(RHS, C, D, RMask);
46520 
46521   // At least one of the operands should be a vector shuffle.
46522   unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
46523   if (NumShuffles == 0)
46524     return false;
46525 
46526   if (LMask.empty()) {
46527     A = LHS;
46528     for (unsigned i = 0; i != NumElts; ++i)
46529       LMask.push_back(i);
46530   }
46531 
46532   if (RMask.empty()) {
46533     C = RHS;
46534     for (unsigned i = 0; i != NumElts; ++i)
46535       RMask.push_back(i);
46536   }
46537 
46538   // If we have a unary mask, ensure the other op is set to null.
46539   if (isUndefOrInRange(LMask, 0, NumElts))
46540     B = SDValue();
46541   else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
46542     A = SDValue();
46543 
46544   if (isUndefOrInRange(RMask, 0, NumElts))
46545     D = SDValue();
46546   else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
46547     C = SDValue();
46548 
46549   // If A and B occur in reverse order in RHS, then canonicalize by commuting
46550   // RHS operands and shuffle mask.
46551   if (A != C) {
46552     std::swap(C, D);
46553     ShuffleVectorSDNode::commuteMask(RMask);
46554   }
46555   // Check that the shuffles are both shuffling the same vectors.
46556   if (!(A == C && B == D))
46557     return false;
46558 
46559   PostShuffleMask.clear();
46560   PostShuffleMask.append(NumElts, SM_SentinelUndef);
46561 
46562   // LHS and RHS are now:
46563   //   LHS = shuffle A, B, LMask
46564   //   RHS = shuffle A, B, RMask
46565   // Check that the masks correspond to performing a horizontal operation.
46566   // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
46567   // so we just repeat the inner loop if this is a 256-bit op.
46568   unsigned Num128BitChunks = VT.getSizeInBits() / 128;
46569   unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
46570   unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
46571   assert((NumEltsPer128BitChunk % 2 == 0) &&
46572          "Vector type should have an even number of elements in each lane");
46573   for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
46574     for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
46575       // Ignore undefined components.
46576       int LIdx = LMask[i + j], RIdx = RMask[i + j];
46577       if (LIdx < 0 || RIdx < 0 ||
46578           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
46579           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
46580         continue;
46581 
46582       // Check that successive odd/even elements are being operated on. If not,
46583       // this is not a horizontal operation.
46584       if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
46585           !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
46586         return false;
46587 
46588       // Compute the post-shuffle mask index based on where the element
46589       // is stored in the HOP result, and where it needs to be moved to.
46590       int Base = LIdx & ~1u;
46591       int Index = ((Base % NumEltsPer128BitChunk) / 2) +
46592                   ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
46593 
46594       // The  low half of the 128-bit result must choose from A.
46595       // The high half of the 128-bit result must choose from B,
46596       // unless B is undef. In that case, we are always choosing from A.
46597       if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
46598         Index += NumEltsPer64BitChunk;
46599       PostShuffleMask[i + j] = Index;
46600     }
46601   }
46602 
46603   SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
46604   SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
46605 
46606   bool IsIdentityPostShuffle =
46607       isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
46608   if (IsIdentityPostShuffle)
46609     PostShuffleMask.clear();
46610 
46611   // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
46612   if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
46613       isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
46614     return false;
46615 
46616   // If the source nodes are already used in HorizOps then always accept this.
46617   // Shuffle folding should merge these back together.
46618   bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
46619     return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
46620   });
46621   bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
46622     return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
46623   });
46624   bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
46625 
46626   // Assume a SingleSource HOP if we only shuffle one input and don't need to
46627   // shuffle the result.
46628   if (!ForceHorizOp &&
46629       !shouldUseHorizontalOp(NewLHS == NewRHS &&
46630                                  (NumShuffles < 2 || !IsIdentityPostShuffle),
46631                              DAG, Subtarget))
46632     return false;
46633 
46634   LHS = DAG.getBitcast(VT, NewLHS);
46635   RHS = DAG.getBitcast(VT, NewRHS);
46636   return true;
46637 }
46638 
46639 // Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
46640 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
46641                                          const X86Subtarget &Subtarget) {
46642   EVT VT = N->getValueType(0);
46643   unsigned Opcode = N->getOpcode();
46644   bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
46645   SmallVector<int, 8> PostShuffleMask;
46646 
46647   switch (Opcode) {
46648   case ISD::FADD:
46649   case ISD::FSUB:
46650     if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
46651         (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
46652       SDValue LHS = N->getOperand(0);
46653       SDValue RHS = N->getOperand(1);
46654       auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
46655       if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
46656                             PostShuffleMask)) {
46657         SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
46658         if (!PostShuffleMask.empty())
46659           HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
46660                                             DAG.getUNDEF(VT), PostShuffleMask);
46661         return HorizBinOp;
46662       }
46663     }
46664     break;
46665   case ISD::ADD:
46666   case ISD::SUB:
46667     if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
46668                                  VT == MVT::v16i16 || VT == MVT::v8i32)) {
46669       SDValue LHS = N->getOperand(0);
46670       SDValue RHS = N->getOperand(1);
46671       auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
46672       if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
46673                             PostShuffleMask)) {
46674         auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
46675                                         ArrayRef<SDValue> Ops) {
46676           return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
46677         };
46678         SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
46679                                               {LHS, RHS}, HOpBuilder);
46680         if (!PostShuffleMask.empty())
46681           HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
46682                                             DAG.getUNDEF(VT), PostShuffleMask);
46683         return HorizBinOp;
46684       }
46685     }
46686     break;
46687   }
46688 
46689   return SDValue();
46690 }
46691 
46692 /// Do target-specific dag combines on floating-point adds/subs.
46693 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
46694                                const X86Subtarget &Subtarget) {
46695   if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
46696     return HOp;
46697   return SDValue();
46698 }
46699 
46700 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
46701 /// the codegen.
46702 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
46703 /// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
46704 ///       anything that is guaranteed to be transformed by DAGCombiner.
46705 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
46706                                           const X86Subtarget &Subtarget,
46707                                           const SDLoc &DL) {
46708   assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
46709   SDValue Src = N->getOperand(0);
46710   unsigned SrcOpcode = Src.getOpcode();
46711   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46712 
46713   EVT VT = N->getValueType(0);
46714   EVT SrcVT = Src.getValueType();
46715 
46716   auto IsFreeTruncation = [VT](SDValue Op) {
46717     unsigned TruncSizeInBits = VT.getScalarSizeInBits();
46718 
46719     // See if this has been extended from a smaller/equal size to
46720     // the truncation size, allowing a truncation to combine with the extend.
46721     unsigned Opcode = Op.getOpcode();
46722     if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
46723          Opcode == ISD::ZERO_EXTEND) &&
46724         Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
46725       return true;
46726 
46727     // See if this is a single use constant which can be constant folded.
46728     // NOTE: We don't peek through bitcasts here because there is currently
46729     // no support for constant folding truncate+bitcast+vector_of_constants, so
46730     // we'd just end up with a truncate on both operands, which would get
46731     // turned back into (truncate (binop)), causing an infinite loop.
46732     return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
46733   };
46734 
46735   auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
46736     SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
46737     SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
46738     return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
46739   };
46740 
46741   // Don't combine if the operation has other uses.
46742   if (!Src.hasOneUse())
46743     return SDValue();
46744 
46745   // Only support vector truncation for now.
46746   // TODO: i64 scalar math would benefit as well.
46747   if (!VT.isVector())
46748     return SDValue();
46749 
46750   // In most cases it's only worth pre-truncating if we're only facing the
46751   // cost of one truncation.
46752   // i.e. if one of the inputs will constant fold or the input is repeated.
46753   switch (SrcOpcode) {
46754   case ISD::MUL:
46755     // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) -
46756     // it's better to truncate if we have the chance.
46757     if (SrcVT.getScalarType() == MVT::i64 &&
46758         TLI.isOperationLegal(SrcOpcode, VT) &&
46759         !TLI.isOperationLegal(SrcOpcode, SrcVT))
46760       return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
46761     LLVM_FALLTHROUGH;
46762   case ISD::AND:
46763   case ISD::XOR:
46764   case ISD::OR:
46765   case ISD::ADD:
46766   case ISD::SUB: {
46767     SDValue Op0 = Src.getOperand(0);
46768     SDValue Op1 = Src.getOperand(1);
46769     if (TLI.isOperationLegal(SrcOpcode, VT) &&
46770         (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
46771       return TruncateArithmetic(Op0, Op1);
46772     break;
46773   }
46774   }
46775 
46776   return SDValue();
46777 }
46778 
46779 /// Truncate using ISD::AND mask and X86ISD::PACKUS.
46780 /// e.g. trunc <8 x i32> X to <8 x i16> -->
46781 /// MaskX = X & 0xffff (clear high bits to prevent saturation)
46782 /// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
46783 static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
46784                                                  const X86Subtarget &Subtarget,
46785                                                  SelectionDAG &DAG) {
46786   SDValue In = N->getOperand(0);
46787   EVT InVT = In.getValueType();
46788   EVT OutVT = N->getValueType(0);
46789 
46790   APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
46791                                     OutVT.getScalarSizeInBits());
46792   In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
46793   return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
46794 }
46795 
46796 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
46797 static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
46798                                                  const X86Subtarget &Subtarget,
46799                                                  SelectionDAG &DAG) {
46800   SDValue In = N->getOperand(0);
46801   EVT InVT = In.getValueType();
46802   EVT OutVT = N->getValueType(0);
46803   In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
46804                    DAG.getValueType(OutVT));
46805   return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
46806 }
46807 
46808 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
46809 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
46810 /// legalization the truncation will be translated into a BUILD_VECTOR whose
46811 /// elements are each extracted from a vector and then truncated, and it is
46812 /// difficult to perform this optimization on that form.
46813 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
46814                                        const X86Subtarget &Subtarget) {
46815   EVT OutVT = N->getValueType(0);
46816   if (!OutVT.isVector())
46817     return SDValue();
46818 
46819   SDValue In = N->getOperand(0);
46820   if (!In.getValueType().isSimple())
46821     return SDValue();
46822 
46823   EVT InVT = In.getValueType();
46824   unsigned NumElems = OutVT.getVectorNumElements();
46825 
46826   // AVX512 provides fast truncate ops.
46827   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
46828     return SDValue();
46829 
46830   EVT OutSVT = OutVT.getVectorElementType();
46831   EVT InSVT = InVT.getVectorElementType();
46832   if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
46833         (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
46834         NumElems >= 8))
46835     return SDValue();
46836 
46837   // SSSE3's pshufb results in fewer instructions in the cases below.
46838   if (Subtarget.hasSSSE3() && NumElems == 8 && InSVT != MVT::i64)
46839     return SDValue();
46840 
46841   SDLoc DL(N);
46842   // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
46843   // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
46844   // truncate 2 x v4i32 to v8i16.
46845   if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
46846     return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
46847   if (InSVT == MVT::i32)
46848     return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
46849 
46850   return SDValue();
46851 }
46852 
46853 /// This function transforms vector truncation of 'extended sign-bits' or
46854 /// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
46855 /// X86ISD::PACKSS/PACKUS operations.
46856 static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
46857                                                SelectionDAG &DAG,
46858                                                const X86Subtarget &Subtarget) {
46859   // Requires SSE2.
46860   if (!Subtarget.hasSSE2())
46861     return SDValue();
46862 
46863   if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
46864     return SDValue();
46865 
46866   SDValue In = N->getOperand(0);
46867   if (!In.getValueType().isSimple())
46868     return SDValue();
46869 
46870   MVT VT = N->getValueType(0).getSimpleVT();
46871   MVT SVT = VT.getScalarType();
46872 
46873   MVT InVT = In.getValueType().getSimpleVT();
46874   MVT InSVT = InVT.getScalarType();
46875 
46876   // Check we have a truncation suited for PACKSS/PACKUS.
46877   if (!isPowerOf2_32(VT.getVectorNumElements()))
46878     return SDValue();
46879   if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
46880     return SDValue();
46881   if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
46882     return SDValue();
46883 
46884   // Truncation to a sub-128-bit vXi32 can be better handled with shuffles.
46885   if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
46886     return SDValue();
46887 
46888   // AVX512 has fast truncate, but if the input is already going to be split,
46889   // there's no harm in trying pack.
46890   if (Subtarget.hasAVX512() &&
46891       !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
46892         InVT.is512BitVector())) {
46893     // PACK should still be worth it for 128-bit vectors if the sources were
46894     // originally concatenated from subvectors.
46895     SmallVector<SDValue> ConcatOps;
46896     if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
46897       return SDValue();
46898   }
46899 
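        // PACKSS saturates signed inputs to 8/16 bits. Packing zero-extended
        // inputs to 16 bits needs SSE4.1's PACKUSDW; SSE2 only has PACKUSWB.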
46900   unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
46901   unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
46902 
46903   // Use PACKUS if the input has zero-bits that extend all the way to the
46904   // packed/truncated value. e.g. masks, zext_in_reg, etc.
46905   KnownBits Known = DAG.computeKnownBits(In);
46906   unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
46907   if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
46908     return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
46909 
46910   // Use PACKSS if the input has sign-bits that extend all the way to the
46911   // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
46912   unsigned NumSignBits = DAG.ComputeNumSignBits(In);
46913 
46914   // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
46915   // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
46916   // on and combines/simplifications can't then use it.
46917   if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
46918     return SDValue();
46919 
46920   unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
46921   if (NumSignBits > MinSignBits)
46922     return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
46923 
46924   // If we have a srl that only generates signbits that we will discard in
46925   // the truncation then we can use PACKSS by converting the srl to a sra.
46926   // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
46927   if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
46928     if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
46929             In, APInt::getAllOnesValue(VT.getVectorNumElements()))) {
46930       if (*ShAmt == MinSignBits) {
46931         SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
46932         return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
46933                                       Subtarget);
46934       }
46935     }
46936 
46937   return SDValue();
46938 }
46939 
46940 // Try to form a MULHU or MULHS node by looking for
46941 // (trunc (srl (mul ext, ext), 16))
46942 // TODO: This is X86 specific because we want to be able to handle wide types
46943 // before type legalization. But we can only do it if the vector will be
46944 // legalized via widening/splitting. Type legalization can't handle promotion
46945 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
46946 // combiner.
46947 static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
46948                             SelectionDAG &DAG, const X86Subtarget &Subtarget) {
46949   // First instruction should be a right shift of a multiply.
46950   if (Src.getOpcode() != ISD::SRL ||
46951       Src.getOperand(0).getOpcode() != ISD::MUL)
46952     return SDValue();
46953 
46954   if (!Subtarget.hasSSE2())
46955     return SDValue();
46956 
46957   // Only handle vXi16 types that are at least 128-bits unless they will be
46958   // widened.
46959   if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
46960     return SDValue();
46961 
46962   // Input type should be at least vXi32.
46963   EVT InVT = Src.getValueType();
46964   if (InVT.getVectorElementType().getSizeInBits() < 32)
46965     return SDValue();
46966 
46967   // Need a shift by 16.
46968   APInt ShiftAmt;
46969   if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
46970       ShiftAmt != 16)
46971     return SDValue();
46972 
46973   SDValue LHS = Src.getOperand(0).getOperand(0);
46974   SDValue RHS = Src.getOperand(0).getOperand(1);
46975 
46976   unsigned ExtOpc = LHS.getOpcode();
46977   if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
46978       RHS.getOpcode() != ExtOpc)
46979     return SDValue();
46980 
46981   // Peek through the extends.
46982   LHS = LHS.getOperand(0);
46983   RHS = RHS.getOperand(0);
46984 
46985   // Ensure the input types match.
46986   if (LHS.getValueType() != VT || RHS.getValueType() != VT)
46987     return SDValue();
46988 
46989   unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
46990   return DAG.getNode(Opc, DL, VT, LHS, RHS);
46991 }
46992 
46993 // Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
46994 // from one vector with signed bytes from another vector, adds together
46995 // adjacent pairs of 16-bit products, and saturates the result before
46996 // truncating to 16-bits.
46997 //
46998 // Which looks something like this:
46999 // (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
47000 //                 (mul (zext (odd elts (i8 A))), (sext (odd elts (i8 B)))))))
47001 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
47002                                const X86Subtarget &Subtarget,
47003                                const SDLoc &DL) {
47004   if (!VT.isVector() || !Subtarget.hasSSSE3())
47005     return SDValue();
47006 
47007   unsigned NumElems = VT.getVectorNumElements();
47008   EVT ScalarVT = VT.getVectorElementType();
47009   if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
47010     return SDValue();
47011 
47012   SDValue SSatVal = detectSSatPattern(In, VT);
47013   if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
47014     return SDValue();
47015 
47016   // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
47017   // of multiplies from even/odd elements.
47018   SDValue N0 = SSatVal.getOperand(0);
47019   SDValue N1 = SSatVal.getOperand(1);
47020 
47021   if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
47022     return SDValue();
47023 
47024   SDValue N00 = N0.getOperand(0);
47025   SDValue N01 = N0.getOperand(1);
47026   SDValue N10 = N1.getOperand(0);
47027   SDValue N11 = N1.getOperand(1);
47028 
47029   // TODO: Handle constant vectors and use knownbits/computenumsignbits?
47030   // Canonicalize zero_extend to LHS.
47031   if (N01.getOpcode() == ISD::ZERO_EXTEND)
47032     std::swap(N00, N01);
47033   if (N11.getOpcode() == ISD::ZERO_EXTEND)
47034     std::swap(N10, N11);
47035 
47036   // Ensure we have a zero_extend and a sign_extend.
47037   if (N00.getOpcode() != ISD::ZERO_EXTEND ||
47038       N01.getOpcode() != ISD::SIGN_EXTEND ||
47039       N10.getOpcode() != ISD::ZERO_EXTEND ||
47040       N11.getOpcode() != ISD::SIGN_EXTEND)
47041     return SDValue();
47042 
47043   // Peek through the extends.
47044   N00 = N00.getOperand(0);
47045   N01 = N01.getOperand(0);
47046   N10 = N10.getOperand(0);
47047   N11 = N11.getOperand(0);
47048 
47049   // Ensure the extend is from vXi8.
47050   if (N00.getValueType().getVectorElementType() != MVT::i8 ||
47051       N01.getValueType().getVectorElementType() != MVT::i8 ||
47052       N10.getValueType().getVectorElementType() != MVT::i8 ||
47053       N11.getValueType().getVectorElementType() != MVT::i8)
47054     return SDValue();
47055 
47056   // All inputs should be build_vectors.
47057   if (N00.getOpcode() != ISD::BUILD_VECTOR ||
47058       N01.getOpcode() != ISD::BUILD_VECTOR ||
47059       N10.getOpcode() != ISD::BUILD_VECTOR ||
47060       N11.getOpcode() != ISD::BUILD_VECTOR)
47061     return SDValue();
47062 
47063   // N00/N10 are zero extended. N01/N11 are sign extended.
47064 
47065   // For each element, we need to ensure we have an odd element from one vector
47066   // multiplied by the odd element of another vector and the even element from
47067   // one of the same vectors being multiplied by the even element from the
47068   // other vector. So we need to make sure for each element i, this operator
47069   // is being performed:
47070   //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
47071   SDValue ZExtIn, SExtIn;
47072   for (unsigned i = 0; i != NumElems; ++i) {
47073     SDValue N00Elt = N00.getOperand(i);
47074     SDValue N01Elt = N01.getOperand(i);
47075     SDValue N10Elt = N10.getOperand(i);
47076     SDValue N11Elt = N11.getOperand(i);
47077     // TODO: Be more tolerant to undefs.
47078     if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47079         N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47080         N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47081         N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
47082       return SDValue();
47083     auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
47084     auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
47085     auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
47086     auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
47087     if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
47088       return SDValue();
47089     unsigned IdxN00 = ConstN00Elt->getZExtValue();
47090     unsigned IdxN01 = ConstN01Elt->getZExtValue();
47091     unsigned IdxN10 = ConstN10Elt->getZExtValue();
47092     unsigned IdxN11 = ConstN11Elt->getZExtValue();
47093     // Add is commutative so indices can be reordered.
47094     if (IdxN00 > IdxN10) {
47095       std::swap(IdxN00, IdxN10);
47096       std::swap(IdxN01, IdxN11);
47097     }
47098     // N0 indices must be the even element; N1 indices the next odd element.
47099     if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
47100         IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
47101       return SDValue();
47102     SDValue N00In = N00Elt.getOperand(0);
47103     SDValue N01In = N01Elt.getOperand(0);
47104     SDValue N10In = N10Elt.getOperand(0);
47105     SDValue N11In = N11Elt.getOperand(0);
47106     // The first time we find an input, capture it.
47107     if (!ZExtIn) {
47108       ZExtIn = N00In;
47109       SExtIn = N01In;
47110     }
47111     if (ZExtIn != N00In || SExtIn != N01In ||
47112         ZExtIn != N10In || SExtIn != N11In)
47113       return SDValue();
47114   }
47115 
47116   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47117                          ArrayRef<SDValue> Ops) {
47118     // Shrink by adding truncate nodes and let DAGCombine fold with the
47119     // sources.
47120     EVT InVT = Ops[0].getValueType();
47121     assert(InVT.getScalarType() == MVT::i8 &&
47122            "Unexpected scalar element type");
47123     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
47124     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
47125                                  InVT.getVectorNumElements() / 2);
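          // VPMADDUBSW consumes pairs of i8 elements, producing half as many
          // i16 results.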
47126     return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
47127   };
47128   return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
47129                           PMADDBuilder);
47130 }
47131 
47132 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
47133                                const X86Subtarget &Subtarget) {
47134   EVT VT = N->getValueType(0);
47135   SDValue Src = N->getOperand(0);
47136   SDLoc DL(N);
47137 
47138   // Attempt to pre-truncate inputs to arithmetic ops instead.
47139   if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
47140     return V;
47141 
47142   // Try to detect AVG pattern first.
47143   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
47144     return Avg;
47145 
47146   // Try to detect PMADDUBSW.
47147   if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
47148     return PMAdd;
47149 
47150   // Try to combine truncation with signed/unsigned saturation.
47151   if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
47152     return Val;
47153 
47154   // Try to combine PMULHUW/PMULHW for vXi16.
47155   if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
47156     return V;
47157 
47158   // The bitcast source is a direct mmx result.
47159   // Detect a truncation to i32 of a bitcast from x86mmx.
47160   if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
47161     SDValue BCSrc = Src.getOperand(0);
47162     if (BCSrc.getValueType() == MVT::x86mmx)
47163       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
47164   }
47165 
47166   // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
47167   if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
47168     return V;
47169 
47170   return combineVectorTruncation(N, DAG, Subtarget);
47171 }
47172 
47173 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
47174                              TargetLowering::DAGCombinerInfo &DCI) {
47175   EVT VT = N->getValueType(0);
47176   SDValue In = N->getOperand(0);
47177   SDLoc DL(N);
47178 
47179   if (auto SSatVal = detectSSatPattern(In, VT))
47180     return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
47181   if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
47182     return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
47183 
47184   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47185   APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
47186   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
47187     return SDValue(N, 0);
47188 
47189   return SDValue();
47190 }
47191 
47192 /// Returns the negated value if the node \p N flips the sign of an FP value.
47193 ///
47194 /// FP-negation nodes may have different forms: FNEG(x), FXOR(x, 0x80000000),
47195 /// or FSUB(0, x).
47196 /// AVX512F does not have FXOR, so FNEG is lowered as
47197 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
47198 /// In this case we go through all bitcasts.
47199 /// This also recognizes splat of a negated value and returns the splat of that
47200 /// value.
47201 static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
47202   if (N->getOpcode() == ISD::FNEG)
47203     return N->getOperand(0);
47204 
47205   // Don't recurse exponentially.
47206   if (Depth > SelectionDAG::MaxRecursionDepth)
47207     return SDValue();
47208 
47209   unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
47210 
47211   SDValue Op = peekThroughBitcasts(SDValue(N, 0));
47212   EVT VT = Op->getValueType(0);
47213 
47214   // Make sure the element size doesn't change.
47215   if (VT.getScalarSizeInBits() != ScalarSize)
47216     return SDValue();
47217 
47218   unsigned Opc = Op.getOpcode();
47219   switch (Opc) {
47220   case ISD::VECTOR_SHUFFLE: {
47221     // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
47222     // of this is VECTOR_SHUFFLE(-VEC1, UNDEF).  The mask can be anything here.
47223     if (!Op.getOperand(1).isUndef())
47224       return SDValue();
47225     if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
47226       if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
47227         return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
47228                                     cast<ShuffleVectorSDNode>(Op)->getMask());
47229     break;
47230   }
47231   case ISD::INSERT_VECTOR_ELT: {
47232     // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
47233     // -V, INDEX).
47234     SDValue InsVector = Op.getOperand(0);
47235     SDValue InsVal = Op.getOperand(1);
47236     if (!InsVector.isUndef())
47237       return SDValue();
47238     if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
47239       if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
47240         return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
47241                            NegInsVal, Op.getOperand(2));
47242     break;
47243   }
47244   case ISD::FSUB:
47245   case ISD::XOR:
47246   case X86ISD::FXOR: {
47247     SDValue Op1 = Op.getOperand(1);
47248     SDValue Op0 = Op.getOperand(0);
47249 
47250     // For XOR and FXOR, we want to check if constant
47251     // bits of Op1 are sign bit masks. For FSUB, we
47252     // have to check if constant bits of Op0 are sign
47253     // bit masks and hence we swap the operands.
47254     if (Opc == ISD::FSUB)
47255       std::swap(Op0, Op1);
47256 
47257     APInt UndefElts;
47258     SmallVector<APInt, 16> EltBits;
47259     // Extract constant bits and see if they are all
47260     // sign bit masks. Ignore the undef elements.
47261     if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
47262                                       /* AllowWholeUndefs */ true,
47263                                       /* AllowPartialUndefs */ false)) {
47264       for (unsigned I = 0, E = EltBits.size(); I < E; I++)
47265         if (!UndefElts[I] && !EltBits[I].isSignMask())
47266           return SDValue();
47267 
47268       return peekThroughBitcasts(Op0);
47269     }
47270   }
47271   }
47272 
47273   return SDValue();
47274 }
47275 
47276 static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
47277                                 bool NegRes) {
47278   if (NegMul) {
47279     switch (Opcode) {
47280     default: llvm_unreachable("Unexpected opcode");
47281     case ISD::FMA:              Opcode = X86ISD::FNMADD;        break;
47282     case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FNMADD; break;
47283     case X86ISD::FMADD_RND:     Opcode = X86ISD::FNMADD_RND;    break;
47284     case X86ISD::FMSUB:         Opcode = X86ISD::FNMSUB;        break;
47285     case X86ISD::STRICT_FMSUB:  Opcode = X86ISD::STRICT_FNMSUB; break;
47286     case X86ISD::FMSUB_RND:     Opcode = X86ISD::FNMSUB_RND;    break;
47287     case X86ISD::FNMADD:        Opcode = ISD::FMA;              break;
47288     case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA;       break;
47289     case X86ISD::FNMADD_RND:    Opcode = X86ISD::FMADD_RND;     break;
47290     case X86ISD::FNMSUB:        Opcode = X86ISD::FMSUB;         break;
47291     case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB;  break;
47292     case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FMSUB_RND;     break;
47293     }
47294   }
47295 
47296   if (NegAcc) {
47297     switch (Opcode) {
47298     default: llvm_unreachable("Unexpected opcode");
47299     case ISD::FMA:              Opcode = X86ISD::FMSUB;         break;
47300     case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FMSUB;  break;
47301     case X86ISD::FMADD_RND:     Opcode = X86ISD::FMSUB_RND;     break;
47302     case X86ISD::FMSUB:         Opcode = ISD::FMA;              break;
47303     case X86ISD::STRICT_FMSUB:  Opcode = ISD::STRICT_FMA;       break;
47304     case X86ISD::FMSUB_RND:     Opcode = X86ISD::FMADD_RND;     break;
47305     case X86ISD::FNMADD:        Opcode = X86ISD::FNMSUB;        break;
47306     case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
47307     case X86ISD::FNMADD_RND:    Opcode = X86ISD::FNMSUB_RND;    break;
47308     case X86ISD::FNMSUB:        Opcode = X86ISD::FNMADD;        break;
47309     case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
47310     case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FNMADD_RND;    break;
47311     case X86ISD::FMADDSUB:      Opcode = X86ISD::FMSUBADD;      break;
47312     case X86ISD::FMADDSUB_RND:  Opcode = X86ISD::FMSUBADD_RND;  break;
47313     case X86ISD::FMSUBADD:      Opcode = X86ISD::FMADDSUB;      break;
47314     case X86ISD::FMSUBADD_RND:  Opcode = X86ISD::FMADDSUB_RND;  break;
47315     }
47316   }
47317 
47318   if (NegRes) {
47319     switch (Opcode) {
47320     // For accuracy reasons, we never combine fneg and fma under strict FP.
47321     default: llvm_unreachable("Unexpected opcode");
47322     case ISD::FMA:             Opcode = X86ISD::FNMSUB;       break;
47323     case X86ISD::FMADD_RND:    Opcode = X86ISD::FNMSUB_RND;   break;
47324     case X86ISD::FMSUB:        Opcode = X86ISD::FNMADD;       break;
47325     case X86ISD::FMSUB_RND:    Opcode = X86ISD::FNMADD_RND;   break;
47326     case X86ISD::FNMADD:       Opcode = X86ISD::FMSUB;        break;
47327     case X86ISD::FNMADD_RND:   Opcode = X86ISD::FMSUB_RND;    break;
47328     case X86ISD::FNMSUB:       Opcode = ISD::FMA;             break;
47329     case X86ISD::FNMSUB_RND:   Opcode = X86ISD::FMADD_RND;    break;
47330     }
47331   }
47332 
47333   return Opcode;
47334 }
47335 
47336 /// Do target-specific dag combines on floating point negations.
47337 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
47338                            TargetLowering::DAGCombinerInfo &DCI,
47339                            const X86Subtarget &Subtarget) {
47340   EVT OrigVT = N->getValueType(0);
47341   SDValue Arg = isFNEG(DAG, N);
47342   if (!Arg)
47343     return SDValue();
47344 
47345   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47346   EVT VT = Arg.getValueType();
47347   EVT SVT = VT.getScalarType();
47348   SDLoc DL(N);
47349 
47350   // Let legalize expand this if it isn't a legal type yet.
47351   if (!TLI.isTypeLegal(VT))
47352     return SDValue();
47353 
47354   // If we're negating a FMUL node on a target with FMA, then we can avoid the
47355   // use of a constant by performing (-0 - A*B) instead.
47356   // FIXME: Check rounding control flags as well once it becomes available.
47357   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
47358       Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
47359     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
47360     SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
47361                                   Arg.getOperand(1), Zero);
47362     return DAG.getBitcast(OrigVT, NewNode);
47363   }
47364 
47365   bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
47366   bool LegalOperations = !DCI.isBeforeLegalizeOps();
47367   if (SDValue NegArg =
47368           TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
47369     return DAG.getBitcast(OrigVT, NegArg);
47370 
47371   return SDValue();
47372 }
47373 
47374 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
47375                                                 bool LegalOperations,
47376                                                 bool ForCodeSize,
47377                                                 NegatibleCost &Cost,
47378                                                 unsigned Depth) const {
47379   // fneg patterns are removable even if they have multiple uses.
47380   if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
47381     Cost = NegatibleCost::Cheaper;
47382     return DAG.getBitcast(Op.getValueType(), Arg);
47383   }
47384 
47385   EVT VT = Op.getValueType();
47386   EVT SVT = VT.getScalarType();
47387   unsigned Opc = Op.getOpcode();
47388   SDNodeFlags Flags = Op.getNode()->getFlags();
47389   switch (Opc) {
47390   case ISD::FMA:
47391   case X86ISD::FMSUB:
47392   case X86ISD::FNMADD:
47393   case X86ISD::FNMSUB:
47394   case X86ISD::FMADD_RND:
47395   case X86ISD::FMSUB_RND:
47396   case X86ISD::FNMADD_RND:
47397   case X86ISD::FNMSUB_RND: {
47398     if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
47399         !(SVT == MVT::f32 || SVT == MVT::f64) ||
47400         !isOperationLegal(ISD::FMA, VT))
47401       break;
47402 
47403     // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
47404     // if it may have signed zeros.
47405     if (!Flags.hasNoSignedZeros())
47406       break;
47407 
47408     // This is always negatible for free but we might be able to remove some
47409     // extra operand negations as well.
47410     SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
47411     for (int i = 0; i != 3; ++i)
47412       NewOps[i] = getCheaperNegatedExpression(
47413           Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
47414 
47415     bool NegA = !!NewOps[0];
47416     bool NegB = !!NewOps[1];
47417     bool NegC = !!NewOps[2];
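          // Negating both A and B cancels out in the product, so only flip the
          // multiply when exactly one was negated; the trailing 'true' negates
          // the whole expression.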
47418     unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
47419 
47420     Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
47421                                   : NegatibleCost::Neutral;
47422 
47423     // Fill in the non-negated ops with the original values.
47424     for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
47425       if (!NewOps[i])
47426         NewOps[i] = Op.getOperand(i);
47427     return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
47428   }
47429   case X86ISD::FRCP:
47430     if (SDValue NegOp0 =
47431             getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
47432                                  ForCodeSize, Cost, Depth + 1))
47433       return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
47434     break;
47435   }
47436 
47437   return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
47438                                               ForCodeSize, Cost, Depth);
47439 }
47440 
47441 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
47442                                  const X86Subtarget &Subtarget) {
47443   MVT VT = N->getSimpleValueType(0);
47444   // If we have integer vector types available, use the integer opcodes.
47445   if (!VT.isVector() || !Subtarget.hasSSE2())
47446     return SDValue();
47447 
47448   SDLoc dl(N);
47449 
47450   unsigned IntBits = VT.getScalarSizeInBits();
47451   MVT IntSVT = MVT::getIntegerVT(IntBits);
47452   MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
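        // e.g. v4f32 -> v4i32, v2f64 -> v2i64, v8f32 -> v8i32.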
47453 
47454   SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
47455   SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
47456   unsigned IntOpcode;
47457   switch (N->getOpcode()) {
47458   default: llvm_unreachable("Unexpected FP logic op");
47459   case X86ISD::FOR:   IntOpcode = ISD::OR; break;
47460   case X86ISD::FXOR:  IntOpcode = ISD::XOR; break;
47461   case X86ISD::FAND:  IntOpcode = ISD::AND; break;
47462   case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
47463   }
47464   SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
47465   return DAG.getBitcast(VT, IntOp);
47466 }
47467 
47468 
47469 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
47470 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
47471   if (N->getOpcode() != ISD::XOR)
47472     return SDValue();
47473 
47474   SDValue LHS = N->getOperand(0);
47475   if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
47476     return SDValue();
47477 
47478   X86::CondCode NewCC = X86::GetOppositeBranchCondition(
47479       X86::CondCode(LHS->getConstantOperandVal(0)));
47480   SDLoc DL(N);
47481   return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
47482 }
47483 
47484 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
47485                           TargetLowering::DAGCombinerInfo &DCI,
47486                           const X86Subtarget &Subtarget) {
47487   SDValue N0 = N->getOperand(0);
47488   SDValue N1 = N->getOperand(1);
47489   EVT VT = N->getValueType(0);
47490 
47491   // If this is SSE1 only convert to FXOR to avoid scalarization.
47492   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
47493     return DAG.getBitcast(MVT::v4i32,
47494                           DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
47495                                       DAG.getBitcast(MVT::v4f32, N0),
47496                                       DAG.getBitcast(MVT::v4f32, N1)));
47497   }
47498 
47499   if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
47500     return Cmp;
47501 
47502   if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
47503     return R;
47504 
47505   if (DCI.isBeforeLegalizeOps())
47506     return SDValue();
47507 
47508   if (SDValue SetCC = foldXor1SetCC(N, DAG))
47509     return SetCC;
47510 
47511   if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
47512     return RV;
47513 
47514   // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
47515   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47516   if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
47517       N0.getOperand(0).getValueType().isVector() &&
47518       N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
47519       TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
47520     return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
47521                                          N0.getOperand(0).getValueType()));
47522   }
47523 
47524   // Handle AVX512 mask widening.
47525   // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
47526   if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
47527       VT.getVectorElementType() == MVT::i1 &&
47528       N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
47529       TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
47530     return DAG.getNode(
47531         ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
47532         DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
47533         N0.getOperand(2));
47534   }
47535 
47536   // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
47537   // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
47538   // TODO: Under what circumstances could this be performed in DAGCombine?
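        // e.g. with c1 = 1 and c2 = 2, xor(zext(xor(X,1)),2) constant folds to
        // xor(zext(X),3).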
47539   if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
47540       N0.getOperand(0).getOpcode() == N->getOpcode()) {
47541     SDValue TruncExtSrc = N0.getOperand(0);
47542     auto *N1C = dyn_cast<ConstantSDNode>(N1);
47543     auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
47544     if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
47545       SDLoc DL(N);
47546       SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
47547       SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
47548       return DAG.getNode(ISD::XOR, DL, VT, LHS,
47549                          DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
47550     }
47551   }
47552 
47553   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
47554     return FPLogic;
47555 
47556   return combineFneg(N, DAG, DCI, Subtarget);
47557 }
47558 
47559 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
47560                             TargetLowering::DAGCombinerInfo &DCI,
47561                             const X86Subtarget &Subtarget) {
47562   EVT VT = N->getValueType(0);
47563   unsigned NumBits = VT.getSizeInBits();
47564 
47565   // TODO - Constant Folding.
47566 
47567   // Simplify the inputs.
47568   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47569   APInt DemandedMask(APInt::getAllOnesValue(NumBits));
47570   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
47571     return SDValue(N, 0);
47572 
47573   return SDValue();
47574 }
47575 
47576 static bool isNullFPScalarOrVectorConst(SDValue V) {
47577   return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
47578 }
47579 
47580 /// If a value is a scalar FP zero or a vector FP zero (potentially including
47581 /// undefined elements), return a zero constant that may be used to fold away
47582 /// that value. In the case of a vector, the returned constant will not contain
47583 /// undefined elements even if the input parameter does. This makes it suitable
47584 /// to be used as a replacement operand with operations (eg, bitwise-and) where
47585 /// an undef should not propagate.
47586 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
47587                                         const X86Subtarget &Subtarget) {
47588   if (!isNullFPScalarOrVectorConst(V))
47589     return SDValue();
47590 
47591   if (V.getValueType().isVector())
47592     return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
47593 
47594   return V;
47595 }
47596 
47597 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
47598                                       const X86Subtarget &Subtarget) {
47599   SDValue N0 = N->getOperand(0);
47600   SDValue N1 = N->getOperand(1);
47601   EVT VT = N->getValueType(0);
47602   SDLoc DL(N);
47603 
47604   // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
47605   if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
47606         (VT == MVT::f64 && Subtarget.hasSSE2()) ||
47607         (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
47608     return SDValue();
47609 
47610   auto isAllOnesConstantFP = [](SDValue V) {
47611     if (V.getSimpleValueType().isVector())
47612       return ISD::isBuildVectorAllOnes(V.getNode());
47613     auto *C = dyn_cast<ConstantFPSDNode>(V);
47614     return C && C->getConstantFPValue()->isAllOnesValue();
47615   };
47616 
47617   // fand (fxor X, -1), Y --> fandn X, Y
47618   if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
47619     return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
47620 
47621   // fand X, (fxor Y, -1) --> fandn Y, X
47622   if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
47623     return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
47624 
47625   return SDValue();
47626 }
47627 
47628 /// Do target-specific dag combines on X86ISD::FAND nodes.
47629 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
47630                            const X86Subtarget &Subtarget) {
47631   // FAND(0.0, x) -> 0.0
47632   if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
47633     return V;
47634 
47635   // FAND(x, 0.0) -> 0.0
47636   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
47637     return V;
47638 
47639   if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
47640     return V;
47641 
47642   return lowerX86FPLogicOp(N, DAG, Subtarget);
47643 }
47644 
47645 /// Do target-specific dag combines on X86ISD::FANDN nodes.
47646 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
47647                             const X86Subtarget &Subtarget) {
47648   // FANDN(0.0, x) -> x
47649   if (isNullFPScalarOrVectorConst(N->getOperand(0)))
47650     return N->getOperand(1);
47651 
47652   // FANDN(x, 0.0) -> 0.0
47653   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
47654     return V;
47655 
47656   return lowerX86FPLogicOp(N, DAG, Subtarget);
47657 }
47658 
47659 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
47660 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
47661                           TargetLowering::DAGCombinerInfo &DCI,
47662                           const X86Subtarget &Subtarget) {
47663   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
47664 
47665   // F[X]OR(0.0, x) -> x
47666   if (isNullFPScalarOrVectorConst(N->getOperand(0)))
47667     return N->getOperand(1);
47668 
47669   // F[X]OR(x, 0.0) -> x
47670   if (isNullFPScalarOrVectorConst(N->getOperand(1)))
47671     return N->getOperand(0);
47672 
47673   if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
47674     return NewVal;
47675 
47676   return lowerX86FPLogicOp(N, DAG, Subtarget);
47677 }
47678 
47679 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
47680 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
47681   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
47682 
47683   // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
47684   if (!DAG.getTarget().Options.NoNaNsFPMath ||
47685       !DAG.getTarget().Options.NoSignedZerosFPMath)
47686     return SDValue();
47687 
47688   // Since NaNs and signed zeros can be ignored, convert the FMAX and FMIN
47689   // nodes into the commutative FMAXC and FMINC operations.
47690   unsigned NewOp = 0;
47691   switch (N->getOpcode()) {
47692     default: llvm_unreachable("unknown opcode");
47693     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
47694     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
47695   }
47696 
47697   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
47698                      N->getOperand(0), N->getOperand(1));
47699 }
47700 
47701 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
47702                                      const X86Subtarget &Subtarget) {
47703   if (Subtarget.useSoftFloat())
47704     return SDValue();
47705 
47706   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47707 
47708   EVT VT = N->getValueType(0);
47709   if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
47710         (Subtarget.hasSSE2() && VT == MVT::f64) ||
47711         (VT.isVector() && TLI.isTypeLegal(VT))))
47712     return SDValue();
47713 
47714   SDValue Op0 = N->getOperand(0);
47715   SDValue Op1 = N->getOperand(1);
47716   SDLoc DL(N);
47717   auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
47718 
47719   // If we don't have to respect NaN inputs, this is a direct translation to x86
47720   // min/max instructions.
47721   if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
47722     return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
47723 
47724   // If one of the operands is known non-NaN use the native min/max instructions
47725   // with the non-NaN input as second operand.
47726   if (DAG.isKnownNeverNaN(Op1))
47727     return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
47728   if (DAG.isKnownNeverNaN(Op0))
47729     return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
47730 
47731   // If we have to respect NaN inputs, this takes at least 3 instructions.
47732   // Favor a library call when operating on a scalar and minimizing code size.
47733   if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
47734     return SDValue();
47735 
47736   EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
47737                                          VT);
47738 
47739   // There are 4 possibilities involving NaN inputs, and these are the required
47740   // outputs:
47741   //                   Op1
47742   //               Num     NaN
47743   //            ----------------
47744   //       Num  |  Max  |  Op0 |
47745   // Op0        ----------------
47746   //       NaN  |  Op1  |  NaN |
47747   //            ----------------
47748   //
47749   // The SSE FP max/min instructions were not designed for this case, but rather
47750   // to implement:
47751   //   Min = Op1 < Op0 ? Op1 : Op0
47752   //   Max = Op1 > Op0 ? Op1 : Op0
47753   //
47754   // So they always return Op0 if either input is a NaN. However, we can still
47755   // use those instructions for fmaxnum by selecting away a NaN input.
47756 
47757   // If either operand is NaN, the 2nd source operand (Op0) is passed through.
47758   SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
47759   SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
47760 
47761   // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
47762   // are NaN, the NaN value of Op1 is the result.
47763   return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
47764 }
47765 
47766 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
47767                                    TargetLowering::DAGCombinerInfo &DCI) {
47768   EVT VT = N->getValueType(0);
47769   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47770 
47771   APInt KnownUndef, KnownZero;
47772   APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
47773   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
47774                                      KnownZero, DCI))
47775     return SDValue(N, 0);
47776 
47777   // Convert a full vector load into vzload when not all bits are needed.
47778   SDValue In = N->getOperand(0);
47779   MVT InVT = In.getSimpleValueType();
47780   if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
47781       ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
47782     assert(InVT.is128BitVector() && "Expected 128-bit input vector");
47783     LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
47784     unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
47785     MVT MemVT = MVT::getIntegerVT(NumBits);
47786     MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
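          // e.g. a v2f64 result from a v4i32 input uses only 64 bits, so load
          // just an i64 as v2i64.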
47787     if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
47788       SDLoc dl(N);
47789       SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
47790                                     DAG.getBitcast(InVT, VZLoad));
47791       DCI.CombineTo(N, Convert);
47792       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47793       DCI.recursivelyDeleteUnusedNodes(LN);
47794       return SDValue(N, 0);
47795     }
47796   }
47797 
47798   return SDValue();
47799 }
47800 
47801 static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
47802                                      TargetLowering::DAGCombinerInfo &DCI) {
47803   bool IsStrict = N->isTargetStrictFPOpcode();
47804   EVT VT = N->getValueType(0);
47805 
47806   // Convert a full vector load into vzload when not all bits are needed.
47807   SDValue In = N->getOperand(IsStrict ? 1 : 0);
47808   MVT InVT = In.getSimpleValueType();
47809   if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
47810       ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
47811     assert(InVT.is128BitVector() && "Expected 128-bit input vector");
47812     LoadSDNode *LN = cast<LoadSDNode>(In);
47813     unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
47814     MVT MemVT = MVT::getFloatingPointVT(NumBits);
47815     MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
47816     if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
47817       SDLoc dl(N);
47818       if (IsStrict) {
47819         SDValue Convert =
47820             DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
47821                         {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
47822         DCI.CombineTo(N, Convert, Convert.getValue(1));
47823       } else {
47824         SDValue Convert =
47825             DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
47826         DCI.CombineTo(N, Convert);
47827       }
47828       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47829       DCI.recursivelyDeleteUnusedNodes(LN);
47830       return SDValue(N, 0);
47831     }
47832   }
47833 
47834   return SDValue();
47835 }
47836 
47837 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
47838 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
47839                             TargetLowering::DAGCombinerInfo &DCI,
47840                             const X86Subtarget &Subtarget) {
47841   MVT VT = N->getSimpleValueType(0);
47842 
47843   // ANDNP(0, x) -> x
47844   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
47845     return N->getOperand(1);
47846 
47847   // ANDNP(x, 0) -> 0
47848   if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
47849     return DAG.getConstant(0, SDLoc(N), VT);
47850 
47851   // Turn ANDNP back to AND if input is inverted.
47852   if (SDValue Not = IsNOT(N->getOperand(0), DAG))
47853     return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
47854                        N->getOperand(1));
47855 
47856   // Attempt to recursively combine a bitmask ANDNP with shuffles.
47857   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
47858     SDValue Op(N, 0);
47859     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47860       return Res;
47861   }
47862 
47863   return SDValue();
47864 }
47865 
47866 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
47867                          TargetLowering::DAGCombinerInfo &DCI) {
47868   SDValue N1 = N->getOperand(1);
47869 
47870   // BT ignores high bits in the bit index operand.
47871   unsigned BitWidth = N1.getValueSizeInBits();
47872   APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
47873   if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
47874     if (N->getOpcode() != ISD::DELETED_NODE)
47875       DCI.AddToWorklist(N);
47876     return SDValue(N, 0);
47877   }
47878 
47879   return SDValue();
47880 }
47881 
47882 static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
47883                                TargetLowering::DAGCombinerInfo &DCI) {
47884   bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
47885   SDValue Src = N->getOperand(IsStrict ? 1 : 0);
47886 
47887   if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
47888     APInt KnownUndef, KnownZero;
47889     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47890     APInt DemandedElts = APInt::getLowBitsSet(8, 4);
47891     if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
47892                                        DCI)) {
47893       if (N->getOpcode() != ISD::DELETED_NODE)
47894         DCI.AddToWorklist(N);
47895       return SDValue(N, 0);
47896     }
47897 
47898     // Convert a full vector load into vzload when not all bits are needed.
47899     if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
47900       LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
47901       if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
47902         SDLoc dl(N);
47903         if (IsStrict) {
47904           SDValue Convert = DAG.getNode(
47905               N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
47906               {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
47907           DCI.CombineTo(N, Convert, Convert.getValue(1));
47908         } else {
47909           SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
47910                                         DAG.getBitcast(MVT::v8i16, VZLoad));
47911           DCI.CombineTo(N, Convert);
47912         }
47913 
47914         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47915         DCI.recursivelyDeleteUnusedNodes(LN);
47916         return SDValue(N, 0);
47917       }
47918     }
47919   }
47920 
47921   return SDValue();
47922 }
47923 
47924 // Try to combine sext_in_reg of a cmov of constants by extending the constants.
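      // e.g. (sext_in_reg (cmov C1, C2, cc, flags), i8) becomes a cmov of the
      // sign-extended constants, and the separate extension of the result goes away.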
47925 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
47926   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
47927 
47928   EVT DstVT = N->getValueType(0);
47929 
47930   SDValue N0 = N->getOperand(0);
47931   SDValue N1 = N->getOperand(1);
47932   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
47933 
47934   if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
47935     return SDValue();
47936 
47937   // Look through single use any_extends / truncs.
47938   SDValue IntermediateBitwidthOp;
47939   if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
47940       N0.hasOneUse()) {
47941     IntermediateBitwidthOp = N0;
47942     N0 = N0.getOperand(0);
47943   }
47944 
47945   // See if we have a single use cmov.
47946   if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
47947     return SDValue();
47948 
47949   SDValue CMovOp0 = N0.getOperand(0);
47950   SDValue CMovOp1 = N0.getOperand(1);
47951 
47952   // Make sure both operands are constants.
47953   if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
47954       !isa<ConstantSDNode>(CMovOp1.getNode()))
47955     return SDValue();
47956 
47957   SDLoc DL(N);
47958 
47959   // If we looked through an any_extend/trunc above, apply it to the constants too.
47960   if (IntermediateBitwidthOp) {
47961     unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
47962     CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
47963     CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
47964   }
47965 
47966   CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
47967   CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
47968 
47969   EVT CMovVT = DstVT;
47970   // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
47971   if (DstVT == MVT::i16) {
47972     CMovVT = MVT::i32;
47973     CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
47974     CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
47975   }
47976 
47977   SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
47978                              N0.getOperand(2), N0.getOperand(3));
47979 
47980   if (CMovVT != DstVT)
47981     CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
47982 
47983   return CMov;
47984 }
47985 
47986 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
47987                                       const X86Subtarget &Subtarget) {
47988   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
47989 
47990   if (SDValue V = combineSextInRegCmov(N, DAG))
47991     return V;
47992 
47993   EVT VT = N->getValueType(0);
47994   SDValue N0 = N->getOperand(0);
47995   SDValue N1 = N->getOperand(1);
47996   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
47997   SDLoc dl(N);
47998 
47999   // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
48000   // AVX2 since there is no sign-extended shift right operation on a vector
48001   // with 64-bit elements.
48002   // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
48003   //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
48004   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
48005                            N0.getOpcode() == ISD::SIGN_EXTEND)) {
48006     SDValue N00 = N0.getOperand(0);
48007 
48008     // EXTLOAD has a better solution on AVX2,
48009     // it may be replaced with X86ISD::VSEXT node.
48010     if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
48011       if (!ISD::isNormalLoad(N00.getNode()))
48012         return SDValue();
48013 
48014     // Attempt to promote any comparison mask ops before the
48015     // SIGN_EXTEND_INREG gets in the way.
48016     if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
48017       return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
48018 
48019     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
48020       SDValue Tmp =
48021           DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
48022       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
48023     }
48024   }
48025   return SDValue();
48026 }
48027 
48028 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
48029 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
48030 /// Promoting a sign/zero extension ahead of a no-overflow 'add' exposes
48031 /// opportunities to combine math ops, use an LEA, or use a complex addressing
48032 /// mode. This can eliminate extend, add, and shift instructions.
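      /// e.g. (i64 sext (add nsw i32 %x, 5)) -> (add nsw i64 (sext %x), 5), where
      /// the extended constant can then become the displacement of an LEA.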
48033 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
48034                                    const X86Subtarget &Subtarget) {
48035   if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
48036       Ext->getOpcode() != ISD::ZERO_EXTEND)
48037     return SDValue();
48038 
48039   // TODO: This should be valid for other integer types.
48040   EVT VT = Ext->getValueType(0);
48041   if (VT != MVT::i64)
48042     return SDValue();
48043 
48044   SDValue Add = Ext->getOperand(0);
48045   if (Add.getOpcode() != ISD::ADD)
48046     return SDValue();
48047 
48048   bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
48049   bool NSW = Add->getFlags().hasNoSignedWrap();
48050   bool NUW = Add->getFlags().hasNoUnsignedWrap();
48051 
48052   // We need an 'add nsw' feeding into the 'sext', or an 'add nuw' feeding
48053   // into the 'zext'.
48054   if ((Sext && !NSW) || (!Sext && !NUW))
48055     return SDValue();
48056 
48057   // Having a constant operand to the 'add' ensures that we are not increasing
48058   // the instruction count because the constant is extended for free below.
48059   // A constant operand can also become the displacement field of an LEA.
48060   auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
48061   if (!AddOp1)
48062     return SDValue();
48063 
48064   // Don't make the 'add' bigger if there's no hope of combining it with some
48065   // other 'add' or 'shl' instruction.
48066   // TODO: It may be profitable to generate simpler LEA instructions in place
48067   // of single 'add' instructions, but the cost model for selecting an LEA
48068   // currently has a high threshold.
48069   bool HasLEAPotential = false;
48070   for (auto *User : Ext->uses()) {
48071     if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
48072       HasLEAPotential = true;
48073       break;
48074     }
48075   }
48076   if (!HasLEAPotential)
48077     return SDValue();
48078 
48079   // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
48080   int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
48081   SDValue AddOp0 = Add.getOperand(0);
48082   SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
48083   SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
48084 
48085   // The wider add will not wrap because the original 'add' had the matching
48086   // no-wrap flag and both operands are extended the same way.
48087   SDNodeFlags Flags;
48088   Flags.setNoSignedWrap(NSW);
48089   Flags.setNoUnsignedWrap(NUW);
48090   return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
48091 }
48092 
48093 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
48094 // operands and the result of CMOV is not used anywhere else - promote CMOV
48095 // itself instead of promoting its result. This could be beneficial, because:
48096 //     1) X86TargetLowering::EmitLoweredSelect later can do merging of two
48097 //        (or more) pseudo-CMOVs only when they go one-after-another and
48098 //        getting rid of result extension code after CMOV will help that.
48099 //     2) Promotion of constant CMOV arguments is free, hence the
48100 //        {ANY,SIGN,ZERO}_EXTEND will just be deleted.
48101 //     3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
48102 //        promotion is also good in terms of code size.
48103 //        (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
48104 //         promotion).
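      // e.g. (i32 zext (i16 cmov C1, C2, cc, flags)) becomes
      // (i32 cmov zext(C1), zext(C2), cc, flags); extending the constants is free.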
48105 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
48106   SDValue CMovN = Extend->getOperand(0);
48107   if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
48108     return SDValue();
48109 
48110   EVT TargetVT = Extend->getValueType(0);
48111   unsigned ExtendOpcode = Extend->getOpcode();
48112   SDLoc DL(Extend);
48113 
48114   EVT VT = CMovN.getValueType();
48115   SDValue CMovOp0 = CMovN.getOperand(0);
48116   SDValue CMovOp1 = CMovN.getOperand(1);
48117 
48118   if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
48119       !isa<ConstantSDNode>(CMovOp1.getNode()))
48120     return SDValue();
48121 
48122   // Only extend to i32 or i64.
48123   if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
48124     return SDValue();
48125 
48126   // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from
48127   // i32 are free.
48128   if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
48129     return SDValue();
48130 
48131   // If this is a zero extend to i64, we should only extend to i32 and use a
48132   // free zero extend to finish.
48133   EVT ExtendVT = TargetVT;
48134   if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
48135     ExtendVT = MVT::i32;
48136 
48137   CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
48138   CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
48139 
48140   SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
48141                             CMovN.getOperand(2), CMovN.getOperand(3));
48142 
48143   // Finish extending if needed.
48144   if (ExtendVT != TargetVT)
48145     Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
48146 
48147   return Res;
48148 }
48149 
48150 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
48151 // This is more or less the reverse of combineBitcastvxi1.
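      // e.g. (v8i16 sext (v8i1 bitcast (i8 x))) is built by broadcasting x to all
      // lanes, masking each lane with its corresponding bit, comparing against the
      // bit mask and sign-extending the result (zero-extension adds a final shift).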
48152 static SDValue
48153 combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
48154                                TargetLowering::DAGCombinerInfo &DCI,
48155                                const X86Subtarget &Subtarget) {
48156   unsigned Opcode = N->getOpcode();
48157   if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
48158       Opcode != ISD::ANY_EXTEND)
48159     return SDValue();
48160   if (!DCI.isBeforeLegalizeOps())
48161     return SDValue();
48162   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
48163     return SDValue();
48164 
48165   SDValue N0 = N->getOperand(0);
48166   EVT VT = N->getValueType(0);
48167   EVT SVT = VT.getScalarType();
48168   EVT InSVT = N0.getValueType().getScalarType();
48169   unsigned EltSizeInBits = SVT.getSizeInBits();
48170 
48171   // We must be extending a bool vector (bit-cast from a scalar integer) to a
48172   // vector with legal integer element types.
48173   if (!VT.isVector())
48174     return SDValue();
48175   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
48176     return SDValue();
48177   if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
48178     return SDValue();
48179 
48180   SDValue N00 = N0.getOperand(0);
48181   EVT SclVT = N0.getOperand(0).getValueType();
48182   if (!SclVT.isScalarInteger())
48183     return SDValue();
48184 
48185   SDLoc DL(N);
48186   SDValue Vec;
48187   SmallVector<int, 32> ShuffleMask;
48188   unsigned NumElts = VT.getVectorNumElements();
48189   assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
48190 
48191   // Broadcast the scalar integer to the vector elements.
48192   if (NumElts > EltSizeInBits) {
48193     // If the scalar integer is greater than the vector element size, then we
48194     // must split it down into sub-sections for broadcasting. For example:
48195     //   i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
48196     //   i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
48197     assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
48198     unsigned Scale = NumElts / EltSizeInBits;
48199     EVT BroadcastVT =
48200         EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
48201     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
48202     Vec = DAG.getBitcast(VT, Vec);
48203 
48204     for (unsigned i = 0; i != Scale; ++i)
48205       ShuffleMask.append(EltSizeInBits, i);
48206     Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
48207   } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
48208              (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
48209     // If we have register broadcast instructions, use the scalar size as the
48210     // element type for the shuffle. Then cast to the wider element type. The
48211     // widened bits won't be used, and this might allow the use of a broadcast
48212     // load.
48213     assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
48214     unsigned Scale = EltSizeInBits / NumElts;
48215     EVT BroadcastVT =
48216         EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
48217     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
48218     ShuffleMask.append(NumElts * Scale, 0);
48219     Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
48220     Vec = DAG.getBitcast(VT, Vec);
48221   } else {
48222     // For a smaller scalar integer, we can simply any-extend it to the
48223     // vector element size (we don't care about the upper bits) and broadcast
48224     // it to all elements.
48225     SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
48226     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
48227     ShuffleMask.append(NumElts, 0);
48228     Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
48229   }
48230 
48231   // Now, mask the relevant bit in each element.
48232   SmallVector<SDValue, 32> Bits;
48233   for (unsigned i = 0; i != NumElts; ++i) {
48234     int BitIdx = (i % EltSizeInBits);
48235     APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
48236     Bits.push_back(DAG.getConstant(Bit, DL, SVT));
48237   }
48238   SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
48239   Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
48240 
48241   // Compare against the bitmask and extend the result.
48242   EVT CCVT = VT.changeVectorElementType(MVT::i1);
48243   Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
48244   Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
48245 
48246   // For SEXT, this is now done, otherwise shift the result down for
48247   // zero-extension.
48248   if (Opcode == ISD::SIGN_EXTEND)
48249     return Vec;
48250   return DAG.getNode(ISD::SRL, DL, VT, Vec,
48251                      DAG.getConstant(EltSizeInBits - 1, DL, VT));
48252 }
48253 
48254 // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
48255 // result type.
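      // e.g. with AVX512, (v8i32 sext (setcc (v8i32 a, v8i32 b, cc))) becomes
      // (v8i32 setcc (a, b, cc)), with a zero-extend-in-reg added for zext nodes.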
48256 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
48257                                const X86Subtarget &Subtarget) {
48258   SDValue N0 = N->getOperand(0);
48259   EVT VT = N->getValueType(0);
48260   SDLoc dl(N);
48261 
48262   // Only do this combine with AVX512 for vector extends.
48263   if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
48264     return SDValue();
48265 
48266   // Only combine legal element types.
48267   EVT SVT = VT.getVectorElementType();
48268   if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
48269       SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
48270     return SDValue();
48271 
48272   // We can only do this if the vector size is 256 bits or less.
48273   unsigned Size = VT.getSizeInBits();
48274   if (Size > 256 && Subtarget.useAVX512Regs())
48275     return SDValue();
48276 
48277   // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
48278   // those are the only integer compares we have.
48279   ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
48280   if (ISD::isUnsignedIntSetCC(CC))
48281     return SDValue();
48282 
48283   // Only do this combine if the extension will be fully consumed by the setcc.
48284   EVT N00VT = N0.getOperand(0).getValueType();
48285   EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
48286   if (Size != MatchingVecType.getSizeInBits())
48287     return SDValue();
48288 
48289   SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
48290 
48291   if (N->getOpcode() == ISD::ZERO_EXTEND)
48292     Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
48293 
48294   return Res;
48295 }
48296 
48297 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
48298                            TargetLowering::DAGCombinerInfo &DCI,
48299                            const X86Subtarget &Subtarget) {
48300   SDValue N0 = N->getOperand(0);
48301   EVT VT = N->getValueType(0);
48302   SDLoc DL(N);
48303 
48304   // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
48305   if (!DCI.isBeforeLegalizeOps() &&
48306       N0.getOpcode() == X86ISD::SETCC_CARRY) {
48307     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
48308                                  N0->getOperand(1));
48309     bool ReplaceOtherUses = !N0.hasOneUse();
48310     DCI.CombineTo(N, Setcc);
48311     // Replace other uses with a truncate of the widened setcc_carry.
48312     if (ReplaceOtherUses) {
48313       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
48314                                   N0.getValueType(), Setcc);
48315       DCI.CombineTo(N0.getNode(), Trunc);
48316     }
48317 
48318     return SDValue(N, 0);
48319   }
48320 
48321   if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
48322     return NewCMov;
48323 
48324   if (!DCI.isBeforeLegalizeOps())
48325     return SDValue();
48326 
48327   if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
48328     return V;
48329 
48330   if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
48331     return V;
48332 
48333   if (VT.isVector()) {
48334     if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
48335       return R;
48336 
48337     if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
48338       return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
48339   }
48340 
48341   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
48342     return NewAdd;
48343 
48344   return SDValue();
48345 }
48346 
48347 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
48348                           TargetLowering::DAGCombinerInfo &DCI,
48349                           const X86Subtarget &Subtarget) {
48350   SDLoc dl(N);
48351   EVT VT = N->getValueType(0);
48352   bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
48353 
48354   // Let legalize expand this if it isn't a legal type yet.
48355   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48356   if (!TLI.isTypeLegal(VT))
48357     return SDValue();
48358 
48359   SDValue A = N->getOperand(IsStrict ? 1 : 0);
48360   SDValue B = N->getOperand(IsStrict ? 2 : 1);
48361   SDValue C = N->getOperand(IsStrict ? 3 : 2);
48362 
48363   // If the operation allows fast-math and the target does not support FMA,
48364   // split this into mul+add to avoid libcall(s).
48365   SDNodeFlags Flags = N->getFlags();
48366   if (!IsStrict && Flags.hasAllowReassociation() &&
48367       TLI.isOperationExpand(ISD::FMA, VT)) {
48368     SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
48369     return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
48370   }
48371 
48372   EVT ScalarVT = VT.getScalarType();
48373   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
48374     return SDValue();
48375 
48376   auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
48377     bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
48378     bool LegalOperations = !DCI.isBeforeLegalizeOps();
48379     if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
48380                                                        CodeSize)) {
48381       V = NegV;
48382       return true;
48383     }
48384     // Look through extract_vector_elts. If it comes from an FNEG, create a
48385     // new extract from the FNEG input.
48386     if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
48387         isNullConstant(V.getOperand(1))) {
48388       SDValue Vec = V.getOperand(0);
48389       if (SDValue NegV = TLI.getCheaperNegatedExpression(
48390               Vec, DAG, LegalOperations, CodeSize)) {
48391         V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
48392                         NegV, V.getOperand(1));
48393         return true;
48394       }
48395     }
48396 
48397     return false;
48398   };
48399 
48400   // Do not convert the passthru input of scalar intrinsics.
48401   // FIXME: We could allow negations of the lower element only.
48402   bool NegA = invertIfNegative(A);
48403   bool NegB = invertIfNegative(B);
48404   bool NegC = invertIfNegative(C);
48405 
48406   if (!NegA && !NegB && !NegC)
48407     return SDValue();
48408 
48409   unsigned NewOpcode =
48410       negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
48411 
48412   // Propagate fast-math-flags to new FMA node.
48413   SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
48414   if (IsStrict) {
48415     assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
48416     return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
48417                        {N->getOperand(0), A, B, C});
48418   } else {
48419     if (N->getNumOperands() == 4)
48420       return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
48421     return DAG.getNode(NewOpcode, dl, VT, A, B, C);
48422   }
48423 }
48424 
48425 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
48426 // Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
48427 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
48428                                TargetLowering::DAGCombinerInfo &DCI) {
48429   SDLoc dl(N);
48430   EVT VT = N->getValueType(0);
48431   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48432   bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
48433   bool LegalOperations = !DCI.isBeforeLegalizeOps();
48434 
48435   SDValue N2 = N->getOperand(2);
48436 
48437   SDValue NegN2 =
48438       TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
48439   if (!NegN2)
48440     return SDValue();
48441   unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
48442 
48443   if (N->getNumOperands() == 4)
48444     return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
48445                        NegN2, N->getOperand(3));
48446   return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
48447                      NegN2);
48448 }
48449 
48450 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
48451                            TargetLowering::DAGCombinerInfo &DCI,
48452                            const X86Subtarget &Subtarget) {
48453   SDLoc dl(N);
48454   SDValue N0 = N->getOperand(0);
48455   EVT VT = N->getValueType(0);
48456 
48457   // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
48458   // FIXME: Is this needed? We don't seem to have any tests for it.
48459   if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
48460       N0.getOpcode() == X86ISD::SETCC_CARRY) {
48461     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
48462                                  N0->getOperand(1));
48463     bool ReplaceOtherUses = !N0.hasOneUse();
48464     DCI.CombineTo(N, Setcc);
48465     // Replace other uses with a truncate of the widened setcc_carry.
48466     if (ReplaceOtherUses) {
48467       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
48468                                   N0.getValueType(), Setcc);
48469       DCI.CombineTo(N0.getNode(), Trunc);
48470     }
48471 
48472     return SDValue(N, 0);
48473   }
48474 
48475   if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
48476     return NewCMov;
48477 
48478   if (DCI.isBeforeLegalizeOps())
48479     if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
48480       return V;
48481 
48482   if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
48483     return V;
48484 
48485   if (VT.isVector())
48486     if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
48487       return R;
48488 
48489   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
48490     return NewAdd;
48491 
48492   if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
48493     return R;
48494 
48495   // TODO: Combine with any target/faux shuffle.
48496   if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
48497       VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
48498     SDValue N00 = N0.getOperand(0);
48499     SDValue N01 = N0.getOperand(1);
48500     unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
48501     APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
48502     if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
48503         (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
48504       return concatSubVectors(N00, N01, DAG, dl);
48505     }
48506   }
48507 
48508   return SDValue();
48509 }
48510 
48511 /// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
48512 /// recognizable memcmp expansion.
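      /// e.g. an OR-rooted tree such as (or (xor a, b), (xor c, d)) compared with
      /// zero, as produced by the memcmp expansion for oversized integer compares.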
48513 static bool isOrXorXorTree(SDValue X, bool Root = true) {
48514   if (X.getOpcode() == ISD::OR)
48515     return isOrXorXorTree(X.getOperand(0), false) &&
48516            isOrXorXorTree(X.getOperand(1), false);
48517   if (Root)
48518     return false;
48519   return X.getOpcode() == ISD::XOR;
48520 }
48521 
48522 /// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
48523 /// expansion.
48524 template<typename F>
48525 static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
48526                                 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
48527   SDValue Op0 = X.getOperand(0);
48528   SDValue Op1 = X.getOperand(1);
48529   if (X.getOpcode() == ISD::OR) {
48530     SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
48531     SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
48532     if (VecVT != CmpVT)
48533       return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
48534     if (HasPT)
48535       return DAG.getNode(ISD::OR, DL, VecVT, A, B);
48536     return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
48537   } else if (X.getOpcode() == ISD::XOR) {
48538     SDValue A = SToV(Op0);
48539     SDValue B = SToV(Op1);
48540     if (VecVT != CmpVT)
48541       return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
48542     if (HasPT)
48543       return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
48544     return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
48545   }
48546   llvm_unreachable("Impossible");
48547 }
48548 
48549 /// Try to map a 128-bit or larger integer comparison to vector instructions
48550 /// before type legalization splits it up into chunks.
48551 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
48552                                                const X86Subtarget &Subtarget) {
48553   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
48554   assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
48555 
48556   // We're looking for an oversized integer equality comparison.
48557   SDValue X = SetCC->getOperand(0);
48558   SDValue Y = SetCC->getOperand(1);
48559   EVT OpVT = X.getValueType();
48560   unsigned OpSize = OpVT.getSizeInBits();
48561   if (!OpVT.isScalarInteger() || OpSize < 128)
48562     return SDValue();
48563 
48564   // Ignore a comparison with zero because that gets special treatment in
48565   // EmitTest(). But make an exception for the special case of a pair of
48566   // logically-combined vector-sized operands compared to zero. This pattern may
48567   // be generated by the memcmp expansion pass with oversized integer compares
48568   // (see PR33325).
48569   bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
48570   if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
48571     return SDValue();
48572 
48573   // Don't perform this combine if constructing the vector will be expensive.
48574   auto IsVectorBitCastCheap = [](SDValue X) {
48575     X = peekThroughBitcasts(X);
48576     return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
48577            X.getOpcode() == ISD::LOAD;
48578   };
48579   if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
48580       !IsOrXorXorTreeCCZero)
48581     return SDValue();
48582 
48583   EVT VT = SetCC->getValueType(0);
48584   SDLoc DL(SetCC);
48585 
48586   // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
48587   // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
48588   // Otherwise use PCMPEQ (plus AND) and mask testing.
48589   if ((OpSize == 128 && Subtarget.hasSSE2()) ||
48590       (OpSize == 256 && Subtarget.hasAVX()) ||
48591       (OpSize == 512 && Subtarget.useAVX512Regs())) {
48592     bool HasPT = Subtarget.hasSSE41();
48593 
48594     // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and
48595     // widened vector registers are essentially free. (Technically, widening
48596     // registers prevents load folding, but the tradeoff is worth it.)
48597     bool PreferKOT = Subtarget.preferMaskRegisters();
48598     bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
48599 
48600     EVT VecVT = MVT::v16i8;
48601     EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
48602     if (OpSize == 256) {
48603       VecVT = MVT::v32i8;
48604       CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
48605     }
48606     EVT CastVT = VecVT;
48607     bool NeedsAVX512FCast = false;
48608     if (OpSize == 512 || NeedZExt) {
48609       if (Subtarget.hasBWI()) {
48610         VecVT = MVT::v64i8;
48611         CmpVT = MVT::v64i1;
48612         if (OpSize == 512)
48613           CastVT = VecVT;
48614       } else {
48615         VecVT = MVT::v16i32;
48616         CmpVT = MVT::v16i1;
48617         CastVT = OpSize == 512 ? VecVT :
48618                  OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
48619         NeedsAVX512FCast = true;
48620       }
48621     }
48622 
48623     auto ScalarToVector = [&](SDValue X) -> SDValue {
48624       bool TmpZext = false;
48625       EVT TmpCastVT = CastVT;
48626       if (X.getOpcode() == ISD::ZERO_EXTEND) {
48627         SDValue OrigX = X.getOperand(0);
48628         unsigned OrigSize = OrigX.getScalarValueSizeInBits();
48629         if (OrigSize < OpSize) {
48630           if (OrigSize == 128) {
48631             TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
48632             X = OrigX;
48633             TmpZext = true;
48634           } else if (OrigSize == 256) {
48635             TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
48636             X = OrigX;
48637             TmpZext = true;
48638           }
48639         }
48640       }
48641       X = DAG.getBitcast(TmpCastVT, X);
48642       if (!NeedZExt && !TmpZext)
48643         return X;
48644       return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
48645                          DAG.getConstant(0, DL, VecVT), X,
48646                          DAG.getVectorIdxConstant(0, DL));
48647     };
48648 
48649     SDValue Cmp;
48650     if (IsOrXorXorTreeCCZero) {
48651       // This is a bitwise-combined equality comparison of 2 pairs of vectors:
48652       // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
48653       // Use 2 vector equality compares and 'and' the results before doing a
48654       // MOVMSK.
48655       Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
48656     } else {
48657       SDValue VecX = ScalarToVector(X);
48658       SDValue VecY = ScalarToVector(Y);
48659       if (VecVT != CmpVT) {
48660         Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
48661       } else if (HasPT) {
48662         Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
48663       } else {
48664         Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
48665       }
48666     }
48667     // AVX512 should emit a setcc that will lower to kortest.
48668     if (VecVT != CmpVT) {
48669       EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
48670                    CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
48671       return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
48672                           DAG.getConstant(0, DL, KRegVT), CC);
48673     }
48674     if (HasPT) {
48675       SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
48676                                      Cmp);
48677       SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
48678       X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
48679       SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
48680       return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
48681     }
48682     // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
48683     // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
48684     // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
48685     assert(Cmp.getValueType() == MVT::v16i8 &&
48686            "Non 128-bit vector on pre-SSE41 target");
48687     SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
48688     SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
48689     return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
48690   }
48691 
48692   return SDValue();
48693 }
48694 
48695 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
48696                             TargetLowering::DAGCombinerInfo &DCI,
48697                             const X86Subtarget &Subtarget) {
48698   const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
48699   const SDValue LHS = N->getOperand(0);
48700   const SDValue RHS = N->getOperand(1);
48701   EVT VT = N->getValueType(0);
48702   EVT OpVT = LHS.getValueType();
48703   SDLoc DL(N);
48704 
48705   if (CC == ISD::SETNE || CC == ISD::SETEQ) {
48706     if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
48707       return V;
48708 
48709     if (VT == MVT::i1 && isNullConstant(RHS)) {
48710       SDValue X86CC;
48711       if (SDValue V =
48712               MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
48713         return DAG.getNode(ISD::TRUNCATE, DL, VT,
48714                            DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
48715     }
48716 
48717     if (OpVT.isScalarInteger()) {
48718       // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
48719       // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
48720       auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
48721         if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
48722           if (N0.getOperand(0) == N1)
48723             return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
48724                                N0.getOperand(1));
48725           if (N0.getOperand(1) == N1)
48726             return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
48727                                N0.getOperand(0));
48728         }
48729         return SDValue();
48730       };
48731       if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
48732         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48733       if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
48734         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48735 
48736       // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
48737       // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
48738       auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
48739         if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
48740           if (N0.getOperand(0) == N1)
48741             return DAG.getNode(ISD::AND, DL, OpVT, N1,
48742                                DAG.getNOT(DL, N0.getOperand(1), OpVT));
48743           if (N0.getOperand(1) == N1)
48744             return DAG.getNode(ISD::AND, DL, OpVT, N1,
48745                                DAG.getNOT(DL, N0.getOperand(0), OpVT));
48746         }
48747         return SDValue();
48748       };
48749       if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
48750         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48751       if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
48752         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48753 
48754       // cmpeq(trunc(x),0) --> cmpeq(x,0)
48755       // cmpne(trunc(x),0) --> cmpne(x,0)
48756       // iff x upper bits are zero.
48757       // TODO: Add support for RHS to be truncate as well?
48758       if (LHS.getOpcode() == ISD::TRUNCATE &&
48759           LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
48760           isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
48761         EVT SrcVT = LHS.getOperand(0).getValueType();
48762         APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
48763                                                 OpVT.getScalarSizeInBits());
48764         const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48765         if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
48766             TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
48767           return DAG.getSetCC(DL, VT, LHS.getOperand(0),
48768                               DAG.getConstant(0, DL, SrcVT), CC);
48769       }
48770     }
48771   }
48772 
48773   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
48774       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
48775     // Using temporaries to avoid messing up operand ordering for later
48776     // transformations if this doesn't work.
48777     SDValue Op0 = LHS;
48778     SDValue Op1 = RHS;
48779     ISD::CondCode TmpCC = CC;
48780     // Put build_vector on the right.
48781     if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
48782       std::swap(Op0, Op1);
48783       TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
48784     }
48785 
48786     bool IsSEXT0 =
48787         (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
48788         (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
48789     bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
48790 
48791     if (IsSEXT0 && IsVZero1) {
48792       assert(VT == Op0.getOperand(0).getValueType() &&
48793              "Unexpected operand type");
48794       if (TmpCC == ISD::SETGT)
48795         return DAG.getConstant(0, DL, VT);
48796       if (TmpCC == ISD::SETLE)
48797         return DAG.getConstant(1, DL, VT);
48798       if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
48799         return DAG.getNOT(DL, Op0.getOperand(0), VT);
48800 
48801       assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
48802              "Unexpected condition code!");
48803       return Op0.getOperand(0);
48804     }
48805   }
48806 
48807   // If we have AVX512 but not BWI, and this is a vXi16/vXi8 setcc, just
48808   // pre-promote its result type since vXi1 vectors don't get promoted
48809   // during type legalization.
48810   // NOTE: The element count check is to ignore operand types that need to
48811   // go through type promotion to a 128-bit vector.
48812   if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
48813       VT.getVectorElementType() == MVT::i1 &&
48814       (OpVT.getVectorElementType() == MVT::i8 ||
48815        OpVT.getVectorElementType() == MVT::i16)) {
48816     SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
48817     return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
48818   }
48819 
48820   // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
48821   // to avoid scalarization via legalization because v4i32 is not a legal type.
48822   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
48823       LHS.getValueType() == MVT::v4f32)
48824     return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
48825 
48826   return SDValue();
48827 }
48828 
48829 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
48830                              TargetLowering::DAGCombinerInfo &DCI,
48831                              const X86Subtarget &Subtarget) {
48832   SDValue Src = N->getOperand(0);
48833   MVT SrcVT = Src.getSimpleValueType();
48834   MVT VT = N->getSimpleValueType(0);
48835   unsigned NumBits = VT.getScalarSizeInBits();
48836   unsigned NumElts = SrcVT.getVectorNumElements();
48837 
48838   // Perform constant folding.
48839   if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
48840     assert(VT == MVT::i32 && "Unexpected result type");
48841     APInt Imm(32, 0);
48842     for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
48843       if (!Src.getOperand(Idx).isUndef() &&
48844           Src.getConstantOperandAPInt(Idx).isNegative())
48845         Imm.setBit(Idx);
48846     }
48847     return DAG.getConstant(Imm, SDLoc(N), VT);
48848   }
48849 
48850   // Look through int->fp bitcasts that don't change the element width.
48851   unsigned EltWidth = SrcVT.getScalarSizeInBits();
48852   if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
48853       Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
48854     return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
48855 
48856   // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
48857   // with scalar comparisons.
48858   if (SDValue NotSrc = IsNOT(Src, DAG)) {
48859     SDLoc DL(N);
48860     APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
48861     NotSrc = DAG.getBitcast(SrcVT, NotSrc);
48862     return DAG.getNode(ISD::XOR, DL, VT,
48863                        DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
48864                        DAG.getConstant(NotMask, DL, VT));
48865   }
48866 
48867   // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
48868   // results with scalar comparisons.
48869   if (Src.getOpcode() == X86ISD::PCMPGT &&
48870       ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
48871     SDLoc DL(N);
48872     APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
48873     return DAG.getNode(ISD::XOR, DL, VT,
48874                        DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
48875                        DAG.getConstant(NotMask, DL, VT));
48876   }
48877 
48878   // Simplify the inputs.
48879   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48880   APInt DemandedMask(APInt::getAllOnesValue(NumBits));
48881   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
48882     return SDValue(N, 0);
48883 
48884   return SDValue();
48885 }
48886 
48887 static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
48888                                        TargetLowering::DAGCombinerInfo &DCI) {
48889   // With vector masks we only demand the upper bit of the mask.
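        // e.g. for a vXi32 mask only the sign bit of each element matters, so mask
        // computations that only affect the lower bits can be simplified away.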
48890   SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
48891   if (Mask.getScalarValueSizeInBits() != 1) {
48892     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48893     APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
48894     if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
48895       if (N->getOpcode() != ISD::DELETED_NODE)
48896         DCI.AddToWorklist(N);
48897       return SDValue(N, 0);
48898     }
48899   }
48900 
48901   return SDValue();
48902 }
48903 
48904 static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
48905                                     SDValue Index, SDValue Base, SDValue Scale,
48906                                     SelectionDAG &DAG) {
48907   SDLoc DL(GorS);
48908 
48909   if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
48910     SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
48911                       Gather->getMask(), Base, Index, Scale } ;
48912     return DAG.getMaskedGather(Gather->getVTList(),
48913                                Gather->getMemoryVT(), DL, Ops,
48914                                Gather->getMemOperand(),
48915                                Gather->getIndexType(),
48916                                Gather->getExtensionType());
48917   }
48918   auto *Scatter = cast<MaskedScatterSDNode>(GorS);
48919   SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
48920                     Scatter->getMask(), Base, Index, Scale };
48921   return DAG.getMaskedScatter(Scatter->getVTList(),
48922                               Scatter->getMemoryVT(), DL,
48923                               Ops, Scatter->getMemOperand(),
48924                               Scatter->getIndexType(),
48925                               Scatter->isTruncatingStore());
48926 }
48927 
48928 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
48929                                     TargetLowering::DAGCombinerInfo &DCI) {
48930   SDLoc DL(N);
48931   auto *GorS = cast<MaskedGatherScatterSDNode>(N);
48932   SDValue Index = GorS->getIndex();
48933   SDValue Base = GorS->getBasePtr();
48934   SDValue Scale = GorS->getScale();
48935 
48936   if (DCI.isBeforeLegalize()) {
48937     unsigned IndexWidth = Index.getScalarValueSizeInBits();
48938 
48939     // Shrink constant indices if they are larger than 32 bits.
48940     // Only do this before legalize types since v2i64 could become v2i32.
48941     // FIXME: We could check that the type is legal if we're after legalize
48942     // types, but then we would need to construct test cases where that happens.
48943     // FIXME: We could support more than just constant vectors, but we must be
48944     // careful with costing. A truncate that can be optimized out would be fine.
48945     // Otherwise we might only want to create a truncate if it avoids a split.
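          // e.g. a v2i64 constant index of <0, 4> has more than 32 sign bits per
          // element and can be truncated to v2i32 before type legalization.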
48946     if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
48947       if (BV->isConstant() && IndexWidth > 32 &&
48948           DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
48949         EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
48950         Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
48951         return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
48952       }
48953     }
48954 
48955     // Shrink any sign/zero extends from a type of 32 bits or smaller to a
48956     // type larger than 32 bits if there are sufficient sign bits. Only do this
48957     // before legalize types to avoid creating illegal types in truncate.
48958     if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
48959          Index.getOpcode() == ISD::ZERO_EXTEND) &&
48960         IndexWidth > 32 &&
48961         Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
48962         DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
48963       EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
48964       Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
48965       return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
48966     }
48967   }
48968 
48969   if (DCI.isBeforeLegalizeOps()) {
48970     unsigned IndexWidth = Index.getScalarValueSizeInBits();
48971 
48972     // Make sure the index element type is either i32 or i64.
48973     if (IndexWidth != 32 && IndexWidth != 64) {
48974       MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
48975       EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
48976       Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
48977       return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
48978     }
48979   }
48980 
48981   // With vector masks we only demand the upper bit of the mask.
48982   SDValue Mask = GorS->getMask();
48983   if (Mask.getScalarValueSizeInBits() != 1) {
48984     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48985     APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
48986     if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
48987       if (N->getOpcode() != ISD::DELETED_NODE)
48988         DCI.AddToWorklist(N);
48989       return SDValue(N, 0);
48990     }
48991   }
48992 
48993   return SDValue();
48994 }
48995 
48996 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
48997 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
48998                                const X86Subtarget &Subtarget) {
48999   SDLoc DL(N);
49000   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
49001   SDValue EFLAGS = N->getOperand(1);
49002 
49003   // Try to simplify the EFLAGS and condition code operands.
49004   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
49005     return getSETCC(CC, Flags, DL, DAG);
49006 
49007   return SDValue();
49008 }
49009 
49010 /// Optimize branch condition evaluation.
49011 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
49012                              const X86Subtarget &Subtarget) {
49013   SDLoc DL(N);
49014   SDValue EFLAGS = N->getOperand(3);
49015   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
49016 
49017   // Try to simplify the EFLAGS and condition code operands.
49018   // Make sure to not keep references to operands, as combineSetCCEFLAGS can
49019   // RAUW them under us.
49020   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
49021     SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
49022     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
49023                        N->getOperand(1), Cond, Flags);
49024   }
49025 
49026   return SDValue();
49027 }
49028 
49029 // TODO: Could we move this to DAGCombine?
49030 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
49031                                                   SelectionDAG &DAG) {
49032   // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
49033   // to optimize away the operation when it is applied to a constant.
49034   //
49035   // The general transformation is:
49036   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
49037   //       AND(VECTOR_CMP(x,y), constant2)
49038   //    constant2 = UNARYOP(constant)
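        // e.g. (sint_to_fp (and (vector_cmp x, y), <1,1,1,1>)) becomes
        // (and (vector_cmp x, y), bitcast(<1.0,1.0,1.0,1.0>)), bitcast back to the
        // FP type, since each compare lane is known to be all-ones or all-zeros.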
49039 
49040   // Early exit if this isn't a vector operation, the operand of the
49041   // unary operation isn't a bitwise AND, or if the sizes of the operations
49042   // aren't the same.
49043   EVT VT = N->getValueType(0);
49044   bool IsStrict = N->isStrictFPOpcode();
49045   unsigned NumEltBits = VT.getScalarSizeInBits();
49046   SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
49047   if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
49048       DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
49049       VT.getSizeInBits() != Op0.getValueSizeInBits())
49050     return SDValue();
49051 
49052   // Now check that the other operand of the AND is a constant. We could
49053   // make the transformation for non-constant splats as well, but it's unclear
49054   // whether that would be a benefit as it would not eliminate any operations,
49055   // just perform one more step in scalar code before moving to the vector unit.
49056   if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
49057     // Bail out if the vector isn't a constant.
49058     if (!BV->isConstant())
49059       return SDValue();
49060 
49061     // Everything checks out. Build up the new and improved node.
49062     SDLoc DL(N);
49063     EVT IntVT = BV->getValueType(0);
49064     // Create a new constant of the appropriate type for the transformed
49065     // DAG.
49066     SDValue SourceConst;
49067     if (IsStrict)
49068       SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
49069                                 {N->getOperand(0), SDValue(BV, 0)});
49070     else
49071       SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
49072     // The AND node needs bitcasts to/from an integer vector type around it.
49073     SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
49074     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
49075                                  MaskConst);
49076     SDValue Res = DAG.getBitcast(VT, NewAnd);
49077     if (IsStrict)
49078       return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
49079     return Res;
49080   }
49081 
49082   return SDValue();
49083 }
49084 
49085 /// If we are converting a value to floating-point, try to replace scalar
49086 /// truncate of an extracted vector element with a bitcast. This tries to keep
49087 /// the sequence on XMM registers rather than moving between vector and GPRs.
49088 static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
49089   // TODO: This is currently only used by combineSIntToFP, but it is generalized
49090   //       to allow being called by any similar cast opcode.
49091   // TODO: Consider merging this into lowering: vectorizeExtractedCast().
49092   SDValue Trunc = N->getOperand(0);
49093   if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
49094     return SDValue();
49095 
49096   SDValue ExtElt = Trunc.getOperand(0);
49097   if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49098       !isNullConstant(ExtElt.getOperand(1)))
49099     return SDValue();
49100 
49101   EVT TruncVT = Trunc.getValueType();
49102   EVT SrcVT = ExtElt.getValueType();
49103   unsigned DestWidth = TruncVT.getSizeInBits();
49104   unsigned SrcWidth = SrcVT.getSizeInBits();
49105   if (SrcWidth % DestWidth != 0)
49106     return SDValue();
49107 
49108   // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
49109   EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
49110   unsigned VecWidth = SrcVecVT.getSizeInBits();
49111   unsigned NumElts = VecWidth / DestWidth;
49112   EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
49113   SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
49114   SDLoc DL(N);
49115   SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
49116                                   BitcastVec, ExtElt.getOperand(1));
49117   return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
49118 }
49119 
49120 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
49121                                const X86Subtarget &Subtarget) {
49122   bool IsStrict = N->isStrictFPOpcode();
49123   SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
49124   EVT VT = N->getValueType(0);
49125   EVT InVT = Op0.getValueType();
49126 
49127   // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
49128   // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
49129   // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
49130   if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
49131     SDLoc dl(N);
49132     EVT DstVT = InVT.changeVectorElementType(MVT::i32);
49133     SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
49134 
49135     // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
49136     if (IsStrict)
49137       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
49138                          {N->getOperand(0), P});
49139     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
49140   }
49141 
  // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known to be zero. Perform
  // that optimization here.
49145   if (DAG.SignBitIsZero(Op0)) {
49146     if (IsStrict)
49147       return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
49148                          {N->getOperand(0), Op0});
49149     return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
49150   }
49151 
49152   return SDValue();
49153 }
49154 
49155 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
49156                                TargetLowering::DAGCombinerInfo &DCI,
49157                                const X86Subtarget &Subtarget) {
  // First try to optimize away the conversion entirely when the source is
  // conditionally selected from a constant. Vectors only.
49160   bool IsStrict = N->isStrictFPOpcode();
49161   if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
49162     return Res;
49163 
49164   // Now move on to more general possibilities.
49165   SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
49166   EVT VT = N->getValueType(0);
49167   EVT InVT = Op0.getValueType();
49168 
49169   // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
49170   // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
49171   // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
49172   if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
49173     SDLoc dl(N);
49174     EVT DstVT = InVT.changeVectorElementType(MVT::i32);
49175     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
49176     if (IsStrict)
49177       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
49178                          {N->getOperand(0), P});
49179     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
49180   }
49181 
49182   // Without AVX512DQ we only support i64 to float scalar conversion. For both
49183   // vectors and scalars, see if we know that the upper bits are all the sign
49184   // bit, in which case we can truncate the input to i32 and convert from that.
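  // e.g. (f64 sint_to_fp (i64 sext (i32 Y))) has at least 33 sign bits, so it
  // can be rewritten as (f64 sint_to_fp (i32 trunc (i64 sext (i32 Y)))), and
  // the trunc-of-sext then folds back to Y.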
49185   if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
49186     unsigned BitWidth = InVT.getScalarSizeInBits();
49187     unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
49188     if (NumSignBits >= (BitWidth - 31)) {
49189       EVT TruncVT = MVT::i32;
49190       if (InVT.isVector())
49191         TruncVT = InVT.changeVectorElementType(TruncVT);
49192       SDLoc dl(N);
49193       if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
49194         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
49195         if (IsStrict)
49196           return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
49197                              {N->getOperand(0), Trunc});
49198         return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
49199       }
49200       // If we're after legalize and the type is v2i32 we need to shuffle and
49201       // use CVTSI2P.
49202       assert(InVT == MVT::v2i64 && "Unexpected VT!");
49203       SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
49204       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
49205                                           { 0, 2, -1, -1 });
49206       if (IsStrict)
49207         return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
49208                            {N->getOperand(0), Shuf});
49209       return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
49210     }
49211   }
49212 
49213   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
49214   // a 32-bit target where SSE doesn't support i64->FP operations.
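  // e.g. on a 32-bit target, (f64 sint_to_fp (i64 load P)) can be lowered via
  // BuildFILD so the x87 FILD instruction loads the i64 directly from memory.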
49215   if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
49216       Op0.getOpcode() == ISD::LOAD) {
49217     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
49218 
49219     // This transformation is not supported if the result type is f16 or f128.
49220     if (VT == MVT::f16 || VT == MVT::f128)
49221       return SDValue();
49222 
49223     // If we have AVX512DQ we can use packed conversion instructions unless
49224     // the VT is f80.
49225     if (Subtarget.hasDQI() && VT != MVT::f80)
49226       return SDValue();
49227 
49228     if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
49229         Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
49230       std::pair<SDValue, SDValue> Tmp =
49231           Subtarget.getTargetLowering()->BuildFILD(
49232               VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
49233               Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
49234       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
49235       return Tmp.first;
49236     }
49237   }
49238 
49239   if (IsStrict)
49240     return SDValue();
49241 
49242   if (SDValue V = combineToFPTruncExtElt(N, DAG))
49243     return V;
49244 
49245   return SDValue();
49246 }
49247 
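/// Return true if any user of this EFLAGS value may read a condition code that
/// depends on the carry or overflow flags (not just a plain equality check),
/// in which case those flags must be preserved.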
49248 static bool needCarryOrOverflowFlag(SDValue Flags) {
49249   assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
49250 
49251   for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
49252          UI != UE; ++UI) {
49253     SDNode *User = *UI;
49254 
49255     X86::CondCode CC;
49256     switch (User->getOpcode()) {
49257     default:
49258       // Be conservative.
49259       return true;
49260     case X86ISD::SETCC:
49261     case X86ISD::SETCC_CARRY:
49262       CC = (X86::CondCode)User->getConstantOperandVal(0);
49263       break;
49264     case X86ISD::BRCOND:
49265       CC = (X86::CondCode)User->getConstantOperandVal(2);
49266       break;
49267     case X86ISD::CMOV:
49268       CC = (X86::CondCode)User->getConstantOperandVal(2);
49269       break;
49270     }
49271 
49272     switch (CC) {
49273     default: break;
49274     case X86::COND_A: case X86::COND_AE:
49275     case X86::COND_B: case X86::COND_BE:
49276     case X86::COND_O: case X86::COND_NO:
49277     case X86::COND_G: case X86::COND_GE:
49278     case X86::COND_L: case X86::COND_LE:
49279       return true;
49280     }
49281   }
49282 
49283   return false;
49284 }
49285 
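/// Return true if every user of this EFLAGS value only tests for equality
/// against zero (COND_E / COND_NE), i.e. only the ZF flag is consumed.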
49286 static bool onlyZeroFlagUsed(SDValue Flags) {
49287   assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
49288 
49289   for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
49290          UI != UE; ++UI) {
49291     SDNode *User = *UI;
49292 
49293     unsigned CCOpNo;
49294     switch (User->getOpcode()) {
49295     default:
49296       // Be conservative.
49297       return false;
49298     case X86ISD::SETCC:       CCOpNo = 0; break;
49299     case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
49300     case X86ISD::BRCOND:      CCOpNo = 2; break;
49301     case X86ISD::CMOV:        CCOpNo = 2; break;
49302     }
49303 
49304     X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
49305     if (CC != X86::COND_E && CC != X86::COND_NE)
49306       return false;
49307   }
49308 
49309   return true;
49310 }
49311 
49312 static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
49313   // Only handle test patterns.
49314   if (!isNullConstant(N->getOperand(1)))
49315     return SDValue();
49316 
49317   // If we have a CMP of a truncated binop, see if we can make a smaller binop
49318   // and use its flags directly.
49319   // TODO: Maybe we should try promoting compares that only use the zero flag
49320   // first if we can prove the upper bits with computeKnownBits?
49321   SDLoc dl(N);
49322   SDValue Op = N->getOperand(0);
49323   EVT VT = Op.getValueType();
49324 
49325   // If we have a constant logical shift that's only used in a comparison
49326   // against zero turn it into an equivalent AND. This allows turning it into
49327   // a TEST instruction later.
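  // e.g. (cmp (i32 srl X, 8), 0) --> (cmp (i32 and X, 0xffffff00), 0), which
  // isel can later select as a TEST against an immediate.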
49328   if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
49329       Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
49330       onlyZeroFlagUsed(SDValue(N, 0))) {
49331     unsigned BitWidth = VT.getSizeInBits();
49332     const APInt &ShAmt = Op.getConstantOperandAPInt(1);
49333     if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
49334       unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
49335       APInt Mask = Op.getOpcode() == ISD::SRL
49336                        ? APInt::getHighBitsSet(BitWidth, MaskBits)
49337                        : APInt::getLowBitsSet(BitWidth, MaskBits);
49338       if (Mask.isSignedIntN(32)) {
49339         Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
49340                          DAG.getConstant(Mask, dl, VT));
49341         return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
49342                            DAG.getConstant(0, dl, VT));
49343       }
49344     }
49345   }
49346 
49347   // Look for a truncate.
49348   if (Op.getOpcode() != ISD::TRUNCATE)
49349     return SDValue();
49350 
49351   SDValue Trunc = Op;
49352   Op = Op.getOperand(0);
49353 
49354   // See if we can compare with zero against the truncation source,
49355   // which should help using the Z flag from many ops. Only do this for
49356   // i32 truncated op to prevent partial-reg compares of promoted ops.
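  // e.g. (cmp (i8 trunc (i32 and X, 255)), 0) --> (cmp (i32 and X, 255), 0)
  // when only the Z flag of the compare is consumed.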
49357   EVT OpVT = Op.getValueType();
49358   APInt UpperBits =
49359       APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
49360   if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
49361       onlyZeroFlagUsed(SDValue(N, 0))) {
49362     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
49363                        DAG.getConstant(0, dl, OpVT));
49364   }
49365 
49366   // After this the truncate and arithmetic op must have a single use.
49367   if (!Trunc.hasOneUse() || !Op.hasOneUse())
    return SDValue();
49369 
49370   unsigned NewOpc;
49371   switch (Op.getOpcode()) {
49372   default: return SDValue();
49373   case ISD::AND:
    // Skip AND with a constant; isel has special handling for AND with an
    // immediate to generate TEST instructions.
49376     if (isa<ConstantSDNode>(Op.getOperand(1)))
49377       return SDValue();
49378     NewOpc = X86ISD::AND;
49379     break;
49380   case ISD::OR:  NewOpc = X86ISD::OR;  break;
49381   case ISD::XOR: NewOpc = X86ISD::XOR; break;
49382   case ISD::ADD:
49383     // If the carry or overflow flag is used, we can't truncate.
49384     if (needCarryOrOverflowFlag(SDValue(N, 0)))
49385       return SDValue();
49386     NewOpc = X86ISD::ADD;
49387     break;
49388   case ISD::SUB:
49389     // If the carry or overflow flag is used, we can't truncate.
49390     if (needCarryOrOverflowFlag(SDValue(N, 0)))
49391       return SDValue();
49392     NewOpc = X86ISD::SUB;
49393     break;
49394   }
49395 
49396   // We found an op we can narrow. Truncate its inputs.
49397   SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
49398   SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
49399 
  // Use an X86-specific opcode to avoid DAG combine messing with it.
49401   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49402   Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
49403 
49404   // For AND, keep a CMP so that we can match the test pattern.
49405   if (NewOpc == X86ISD::AND)
49406     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
49407                        DAG.getConstant(0, dl, VT));
49408 
49409   // Return the flags.
49410   return Op.getValue(1);
49411 }
49412 
49413 static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
49414                                 TargetLowering::DAGCombinerInfo &DCI) {
49415   assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
49416          "Expected X86ISD::ADD or X86ISD::SUB");
49417 
49418   SDLoc DL(N);
49419   SDValue LHS = N->getOperand(0);
49420   SDValue RHS = N->getOperand(1);
49421   MVT VT = LHS.getSimpleValueType();
49422   unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
49423 
49424   // If we don't use the flag result, simplify back to a generic ADD/SUB.
49425   if (!N->hasAnyUseOfValue(1)) {
49426     SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
49427     return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
49428   }
49429 
49430   // Fold any similar generic ADD/SUB opcodes to reuse this node.
49431   auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
49432     SDValue Ops[] = {N0, N1};
49433     SDVTList VTs = DAG.getVTList(N->getValueType(0));
49434     if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
49435       SDValue Op(N, 0);
49436       if (Negate)
49437         Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
49438       DCI.CombineTo(GenericAddSub, Op);
49439     }
49440   };
49441   MatchGeneric(LHS, RHS, false);
49442   MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
49443 
49444   return SDValue();
49445 }
49446 
49447 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
49448   if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
49449     MVT VT = N->getSimpleValueType(0);
49450     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49451     return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
49452                        N->getOperand(0), N->getOperand(1),
49453                        Flags);
49454   }
49455 
49456   // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
49457   // iff the flag result is dead.
49458   SDValue Op0 = N->getOperand(0);
49459   SDValue Op1 = N->getOperand(1);
49460   if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
49461       !N->hasAnyUseOfValue(1))
49462     return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
49463                        Op0.getOperand(1), N->getOperand(2));
49464 
49465   return SDValue();
49466 }
49467 
49468 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
49469 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
49470                           TargetLowering::DAGCombinerInfo &DCI) {
49471   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
49472   // the result is either zero or one (depending on the input carry bit).
49473   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
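  // i.e. (adc 0, 0, Carry) --> (and (setcc_carry COND_B, Carry), 1), with a
  // known-zero carry-out.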
49474   if (X86::isZeroNode(N->getOperand(0)) &&
49475       X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this
      // when the flag result is dead right now.
49478       SDValue(N, 1).use_empty()) {
49479     SDLoc DL(N);
49480     EVT VT = N->getValueType(0);
49481     SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
49482     SDValue Res1 =
49483         DAG.getNode(ISD::AND, DL, VT,
49484                     DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49485                                 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49486                                 N->getOperand(2)),
49487                     DAG.getConstant(1, DL, VT));
49488     return DCI.CombineTo(N, Res1, CarryOut);
49489   }
49490 
49491   if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
49492     MVT VT = N->getSimpleValueType(0);
49493     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49494     return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
49495                        N->getOperand(0), N->getOperand(1),
49496                        Flags);
49497   }
49498 
49499   return SDValue();
49500 }
49501 
49502 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
49503 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
49504 /// with CMP+{ADC, SBB}.
49505 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
49506   bool IsSub = N->getOpcode() == ISD::SUB;
49507   SDValue X = N->getOperand(0);
49508   SDValue Y = N->getOperand(1);
49509 
49510   // If this is an add, canonicalize a zext operand to the RHS.
49511   // TODO: Incomplete? What if both sides are zexts?
49512   if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
49513       Y.getOpcode() != ISD::ZERO_EXTEND)
49514     std::swap(X, Y);
49515 
49516   // Look through a one-use zext.
49517   bool PeekedThroughZext = false;
49518   if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
49519     Y = Y.getOperand(0);
49520     PeekedThroughZext = true;
49521   }
49522 
49523   // If this is an add, canonicalize a setcc operand to the RHS.
49524   // TODO: Incomplete? What if both sides are setcc?
49525   // TODO: Should we allow peeking through a zext of the other operand?
49526   if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
49527       Y.getOpcode() != X86ISD::SETCC)
49528     std::swap(X, Y);
49529 
49530   if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
49531     return SDValue();
49532 
49533   SDLoc DL(N);
49534   EVT VT = N->getValueType(0);
49535   X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
49536 
49537   // If X is -1 or 0, then we have an opportunity to avoid constants required in
49538   // the general case below.
49539   auto *ConstantX = dyn_cast<ConstantSDNode>(X);
49540   if (ConstantX) {
49541     if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
49542         (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
49543       // This is a complicated way to get -1 or 0 from the carry flag:
49544       // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49545       //  0 - SETB  -->  0 -  (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49546       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49547                          DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49548                          Y.getOperand(1));
49549     }
49550 
49551     if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
49552         (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
49553       SDValue EFLAGS = Y->getOperand(1);
49554       if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
49555           EFLAGS.getValueType().isInteger() &&
49556           !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49557         // Swap the operands of a SUB, and we have the same pattern as above.
49558         // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
49559         //  0 - SETA  (SUB A, B) -->  0 - SETB  (SUB B, A) --> SUB + SBB
49560         SDValue NewSub = DAG.getNode(
49561             X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49562             EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49563         SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
49564         return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49565                            DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49566                            NewEFLAGS);
49567       }
49568     }
49569   }
49570 
49571   if (CC == X86::COND_B) {
49572     // X + SETB Z --> adc X, 0
49573     // X - SETB Z --> sbb X, 0
49574     return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49575                        DAG.getVTList(VT, MVT::i32), X,
49576                        DAG.getConstant(0, DL, VT), Y.getOperand(1));
49577   }
49578 
49579   if (CC == X86::COND_A) {
49580     SDValue EFLAGS = Y.getOperand(1);
49581     // Try to convert COND_A into COND_B in an attempt to facilitate
49582     // materializing "setb reg".
49583     //
    // Do not flip "e > c", where "c" is a constant, because the CMP
    // instruction cannot take an immediate as its first operand.
49586     //
49587     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49588         EFLAGS.getValueType().isInteger() &&
49589         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49590       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
49591                                    EFLAGS.getNode()->getVTList(),
49592                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49593       SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49594       return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49595                          DAG.getVTList(VT, MVT::i32), X,
49596                          DAG.getConstant(0, DL, VT), NewEFLAGS);
49597     }
49598   }
49599 
49600   if (CC == X86::COND_AE) {
49601     // X + SETAE --> sbb X, -1
49602     // X - SETAE --> adc X, -1
49603     return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49604                        DAG.getVTList(VT, MVT::i32), X,
49605                        DAG.getConstant(-1, DL, VT), Y.getOperand(1));
49606   }
49607 
49608   if (CC == X86::COND_BE) {
49609     // X + SETBE --> sbb X, -1
49610     // X - SETBE --> adc X, -1
49611     SDValue EFLAGS = Y.getOperand(1);
49612     // Try to convert COND_BE into COND_AE in an attempt to facilitate
49613     // materializing "setae reg".
49614     //
    // Do not flip "e <= c", where "c" is a constant, because the CMP
    // instruction cannot take an immediate as its first operand.
49617     //
49618     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49619         EFLAGS.getValueType().isInteger() &&
49620         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49621       SDValue NewSub = DAG.getNode(
49622           X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49623           EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49624       SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49625       return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49626                          DAG.getVTList(VT, MVT::i32), X,
49627                          DAG.getConstant(-1, DL, VT), NewEFLAGS);
49628     }
49629   }
49630 
49631   if (CC != X86::COND_E && CC != X86::COND_NE)
49632     return SDValue();
49633 
49634   SDValue Cmp = Y.getOperand(1);
49635   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
49636       !X86::isZeroNode(Cmp.getOperand(1)) ||
49637       !Cmp.getOperand(0).getValueType().isInteger())
49638     return SDValue();
49639 
49640   SDValue Z = Cmp.getOperand(0);
49641   EVT ZVT = Z.getValueType();
49642 
49643   // If X is -1 or 0, then we have an opportunity to avoid constants required in
49644   // the general case below.
49645   if (ConstantX) {
49646     // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
49647     // fake operands:
49648     //  0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
49649     // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
49650     if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
49651         (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
49652       SDValue Zero = DAG.getConstant(0, DL, ZVT);
49653       SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49654       SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
49655       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49656                          DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49657                          SDValue(Neg.getNode(), 1));
49658     }
49659 
49660     // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
49661     // with fake operands:
49662     //  0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
49663     // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
49664     if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
49665         (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
49666       SDValue One = DAG.getConstant(1, DL, ZVT);
49667       SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49668       SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49669       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49670                          DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49671                          Cmp1.getValue(1));
49672     }
49673   }
49674 
49675   // (cmp Z, 1) sets the carry flag if Z is 0.
49676   SDValue One = DAG.getConstant(1, DL, ZVT);
49677   SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49678   SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49679 
49680   // Add the flags type for ADC/SBB nodes.
49681   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49682 
49683   // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
49684   // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
49685   if (CC == X86::COND_NE)
49686     return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
49687                        DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
49688 
49689   // X - (Z == 0) --> sub X, (zext(sete  Z, 0)) --> sbb X, 0, (cmp Z, 1)
49690   // X + (Z == 0) --> add X, (zext(sete  Z, 0)) --> adc X, 0, (cmp Z, 1)
49691   return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
49692                      DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
49693 }
49694 
49695 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
49696                             const SDLoc &DL, EVT VT,
49697                             const X86Subtarget &Subtarget) {
  // Example of the pattern we try to detect:
  // t := (v8i32 mul (sext (v8i16 x0)), (sext (v8i16 x1)))
  // (add (build_vector (extract_elt t, 0),
  //                    (extract_elt t, 2),
  //                    (extract_elt t, 4),
  //                    (extract_elt t, 6)),
  //      (build_vector (extract_elt t, 1),
  //                    (extract_elt t, 3),
  //                    (extract_elt t, 5),
  //                    (extract_elt t, 7)))
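  // Each i32 lane of the resulting add is x0[2i]*x1[2i] + x0[2i+1]*x1[2i+1],
  // which is exactly what (v)pmaddwd computes from sign-extended i16 inputs.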
49708 
49709   if (!Subtarget.hasSSE2())
49710     return SDValue();
49711 
49712   if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
49713       Op1.getOpcode() != ISD::BUILD_VECTOR)
49714     return SDValue();
49715 
49716   if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
49717       VT.getVectorNumElements() < 4 ||
49718       !isPowerOf2_32(VT.getVectorNumElements()))
49719     return SDValue();
49720 
49721   // Check if one of Op0,Op1 is of the form:
49722   // (build_vector (extract_elt Mul, 0),
49723   //               (extract_elt Mul, 2),
49724   //               (extract_elt Mul, 4),
49725   //                   ...
49726   // the other is of the form:
49727   // (build_vector (extract_elt Mul, 1),
49728   //               (extract_elt Mul, 3),
49729   //               (extract_elt Mul, 5),
49730   //                   ...
49731   // and identify Mul.
49732   SDValue Mul;
49733   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
49734     SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
49735             Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
49736     // TODO: Be more tolerant to undefs.
49737     if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49738         Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49739         Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49740         Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
49741       return SDValue();
49742     auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
49743     auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
49744     auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
49745     auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
49746     if (!Const0L || !Const1L || !Const0H || !Const1H)
49747       return SDValue();
49748     unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
49749              Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
49750     // Commutativity of mul allows factors of a product to reorder.
49751     if (Idx0L > Idx1L)
49752       std::swap(Idx0L, Idx1L);
49753     if (Idx0H > Idx1H)
49754       std::swap(Idx0H, Idx1H);
49755     // Commutativity of add allows pairs of factors to reorder.
49756     if (Idx0L > Idx0H) {
49757       std::swap(Idx0L, Idx0H);
49758       std::swap(Idx1L, Idx1H);
49759     }
49760     if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
49761         Idx1H != 2 * i + 3)
49762       return SDValue();
49763     if (!Mul) {
      // The first time an extract_elt's source vector is visited it must be a
      // MUL with twice as many vector elements as the BUILD_VECTOR, and both
      // extracts must come from that same MUL.
49767       Mul = Op0L->getOperand(0);
49768       if (Mul->getOpcode() != ISD::MUL ||
49769           Mul.getValueType().getVectorNumElements() != 2 * e)
49770         return SDValue();
49771     }
49772     // Check that the extract is from the same MUL previously seen.
49773     if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
49774         Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
49775       return SDValue();
49776   }
49777 
49778   // Check if the Mul source can be safely shrunk.
49779   ShrinkMode Mode;
49780   if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
49781       Mode == ShrinkMode::MULU16)
49782     return SDValue();
49783 
49784   EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49785                                  VT.getVectorNumElements() * 2);
49786   SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
49787   SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
49788 
49789   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49790                          ArrayRef<SDValue> Ops) {
49791     EVT InVT = Ops[0].getValueType();
49792     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
49793     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
49794                                  InVT.getVectorNumElements() / 2);
49795     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
49796   };
49797   return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
49798 }
49799 
49800 // Attempt to turn this pattern into PMADDWD.
49801 // (add (mul (sext (build_vector)), (sext (build_vector))),
//      (mul (sext (build_vector)), (sext (build_vector))))
49803 static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
49804                               const SDLoc &DL, EVT VT,
49805                               const X86Subtarget &Subtarget) {
49806   if (!Subtarget.hasSSE2())
49807     return SDValue();
49808 
49809   if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
49810     return SDValue();
49811 
49812   if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
49813       VT.getVectorNumElements() < 4 ||
49814       !isPowerOf2_32(VT.getVectorNumElements()))
49815     return SDValue();
49816 
49817   SDValue N00 = N0.getOperand(0);
49818   SDValue N01 = N0.getOperand(1);
49819   SDValue N10 = N1.getOperand(0);
49820   SDValue N11 = N1.getOperand(1);
49821 
49822   // All inputs need to be sign extends.
49823   // TODO: Support ZERO_EXTEND from known positive?
49824   if (N00.getOpcode() != ISD::SIGN_EXTEND ||
49825       N01.getOpcode() != ISD::SIGN_EXTEND ||
49826       N10.getOpcode() != ISD::SIGN_EXTEND ||
49827       N11.getOpcode() != ISD::SIGN_EXTEND)
49828     return SDValue();
49829 
49830   // Peek through the extends.
49831   N00 = N00.getOperand(0);
49832   N01 = N01.getOperand(0);
49833   N10 = N10.getOperand(0);
49834   N11 = N11.getOperand(0);
49835 
49836   // Must be extending from vXi16.
49837   EVT InVT = N00.getValueType();
49838   if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
49839       N10.getValueType() != InVT || N11.getValueType() != InVT)
49840     return SDValue();
49841 
49842   // All inputs should be build_vectors.
49843   if (N00.getOpcode() != ISD::BUILD_VECTOR ||
49844       N01.getOpcode() != ISD::BUILD_VECTOR ||
49845       N10.getOpcode() != ISD::BUILD_VECTOR ||
49846       N11.getOpcode() != ISD::BUILD_VECTOR)
49847     return SDValue();
49848 
  // For each result element we need the even element of one input vector
  // multiplied by the even element of the other vector, added to the product
  // of the corresponding odd elements. That is, for each element i the
  // combined operation must compute:
49854   //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
49855   SDValue In0, In1;
49856   for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
49857     SDValue N00Elt = N00.getOperand(i);
49858     SDValue N01Elt = N01.getOperand(i);
49859     SDValue N10Elt = N10.getOperand(i);
49860     SDValue N11Elt = N11.getOperand(i);
49861     // TODO: Be more tolerant to undefs.
49862     if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49863         N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49864         N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49865         N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
49866       return SDValue();
49867     auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
49868     auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
49869     auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
49870     auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
49871     if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
49872       return SDValue();
49873     unsigned IdxN00 = ConstN00Elt->getZExtValue();
49874     unsigned IdxN01 = ConstN01Elt->getZExtValue();
49875     unsigned IdxN10 = ConstN10Elt->getZExtValue();
49876     unsigned IdxN11 = ConstN11Elt->getZExtValue();
49877     // Add is commutative so indices can be reordered.
49878     if (IdxN00 > IdxN10) {
49879       std::swap(IdxN00, IdxN10);
49880       std::swap(IdxN01, IdxN11);
49881     }
    // N0 indices must be the even elements. N1 indices must be the next odd
    // elements.
49883     if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
49884         IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
49885       return SDValue();
49886     SDValue N00In = N00Elt.getOperand(0);
49887     SDValue N01In = N01Elt.getOperand(0);
49888     SDValue N10In = N10Elt.getOperand(0);
49889     SDValue N11In = N11Elt.getOperand(0);
49890 
49891     // First time we find an input capture it.
49892     if (!In0) {
49893       In0 = N00In;
49894       In1 = N01In;
49895 
49896       // The input vectors must be at least as wide as the output.
      // If they are larger than the output, we extract a subvector below.
49898       if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
49899           In1.getValueSizeInBits() < VT.getSizeInBits())
49900         return SDValue();
49901     }
49902     // Mul is commutative so the input vectors can be in any order.
49903     // Canonicalize to make the compares easier.
49904     if (In0 != N00In)
49905       std::swap(N00In, N01In);
49906     if (In0 != N10In)
49907       std::swap(N10In, N11In);
49908     if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
49909       return SDValue();
49910   }
49911 
49912   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49913                          ArrayRef<SDValue> Ops) {
49914     EVT OpVT = Ops[0].getValueType();
49915     assert(OpVT.getScalarType() == MVT::i16 &&
49916            "Unexpected scalar element type");
49917     assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
49918     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
49919                                  OpVT.getVectorNumElements() / 2);
49920     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
49921   };
49922 
49923   // If the output is narrower than an input, extract the low part of the input
49924   // vector.
49925   EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49926                                VT.getVectorNumElements() * 2);
49927   if (OutVT16.bitsLT(In0.getValueType())) {
49928     In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
49929                       DAG.getIntPtrConstant(0, DL));
49930   }
49931   if (OutVT16.bitsLT(In1.getValueType())) {
49932     In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
49933                       DAG.getIntPtrConstant(0, DL));
49934   }
49935   return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
49936                           PMADDBuilder);
49937 }
49938 
49939 /// CMOV of constants requires materializing constant operands in registers.
49940 /// Try to fold those constants into an 'add' instruction to reduce instruction
/// count. We do this with CMOV rather than the generic 'select' because there
/// are earlier folds that may be used to turn select-of-constants into logic
/// hacks.
49943 static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) {
49944   // If an operand is zero, add-of-0 gets simplified away, so that's clearly
49945   // better because we eliminate 1-2 instructions. This transform is still
49946   // an improvement without zero operands because we trade 2 move constants and
49947   // 1 add for 2 adds (LEA) as long as the constants can be represented as
49948   // immediate asm operands (fit in 32-bits).
49949   auto isSuitableCmov = [](SDValue V) {
49950     if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
49951       return false;
49952     if (!isa<ConstantSDNode>(V.getOperand(0)) ||
49953         !isa<ConstantSDNode>(V.getOperand(1)))
49954       return false;
49955     return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
49956            (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
49957             V.getConstantOperandAPInt(1).isSignedIntN(32));
49958   };
49959 
49960   // Match an appropriate CMOV as the first operand of the add.
49961   SDValue Cmov = N->getOperand(0);
49962   SDValue OtherOp = N->getOperand(1);
49963   if (!isSuitableCmov(Cmov))
49964     std::swap(Cmov, OtherOp);
49965   if (!isSuitableCmov(Cmov))
49966     return SDValue();
49967 
49968   // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
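  // e.g. (add (cmov 0, 42, cc), X) --> (cmov X, (add X, 42), cc); the add of
  // the zero operand is simplified away, as noted above.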
49969   EVT VT = N->getValueType(0);
49970   SDLoc DL(N);
49971   SDValue FalseOp = Cmov.getOperand(0);
49972   SDValue TrueOp = Cmov.getOperand(1);
49973   FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
49974   TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
49975   return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
49976                      Cmov.getOperand(3));
49977 }
49978 
49979 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
49980                           TargetLowering::DAGCombinerInfo &DCI,
49981                           const X86Subtarget &Subtarget) {
49982   EVT VT = N->getValueType(0);
49983   SDValue Op0 = N->getOperand(0);
49984   SDValue Op1 = N->getOperand(1);
49985 
49986   if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG))
49987     return Select;
49988 
49989   if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
49990     return MAdd;
49991   if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
49992     return MAdd;
49993 
49994   // Try to synthesize horizontal adds from adds of shuffles.
49995   if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
49996     return V;
49997 
49998   // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
49999   // (sub Y, (sext (vXi1 X))).
50000   // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
50001   // generic DAG combine without a legal type check, but adding this there
50002   // caused regressions.
50003   if (VT.isVector()) {
50004     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50005     if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
50006         Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
50007         TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
50008       SDLoc DL(N);
50009       SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
50010       return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
50011     }
50012 
50013     if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
50014         Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
50015         TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
50016       SDLoc DL(N);
50017       SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
50018       return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
50019     }
50020   }
50021 
50022   return combineAddOrSubToADCOrSBB(N, DAG);
50023 }
50024 
50025 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
50026                           TargetLowering::DAGCombinerInfo &DCI,
50027                           const X86Subtarget &Subtarget) {
50028   SDValue Op0 = N->getOperand(0);
50029   SDValue Op1 = N->getOperand(1);
50030 
50031   // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
50032   auto IsNonOpaqueConstant = [&](SDValue Op) {
50033     if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
50034       if (auto *Cst = dyn_cast<ConstantSDNode>(C))
50035         return !Cst->isOpaque();
50036       return true;
50037     }
50038     return false;
50039   };
50040 
50041   // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction. If the RHS of the sub is an XOR
  // with one use and a constant, invert the immediate, saving one register.
50044   // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
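  // This is valid because xor(X, ~C2) == ~xor(X, C2), and for any value V we
  // have C1 - V == ~V + C1 + 1.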
50045   if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
50046       IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
50047     SDLoc DL(N);
50048     EVT VT = Op0.getValueType();
50049     SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
50050                                  DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
50051     SDValue NewAdd =
50052         DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
50053     return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
50054   }
50055 
50056   // Try to synthesize horizontal subs from subs of shuffles.
50057   if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
50058     return V;
50059 
50060   return combineAddOrSubToADCOrSBB(N, DAG);
50061 }
50062 
50063 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
50064                                     const X86Subtarget &Subtarget) {
50065   MVT VT = N->getSimpleValueType(0);
50066   SDLoc DL(N);
50067 
50068   if (N->getOperand(0) == N->getOperand(1)) {
50069     if (N->getOpcode() == X86ISD::PCMPEQ)
50070       return DAG.getConstant(-1, DL, VT);
50071     if (N->getOpcode() == X86ISD::PCMPGT)
50072       return DAG.getConstant(0, DL, VT);
50073   }
50074 
50075   return SDValue();
50076 }
50077 
50078 /// Helper that combines an array of subvector ops as if they were the operands
50079 /// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
50080 /// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
50081 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
50082                                       ArrayRef<SDValue> Ops, SelectionDAG &DAG,
50083                                       TargetLowering::DAGCombinerInfo &DCI,
50084                                       const X86Subtarget &Subtarget) {
50085   assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
50086   unsigned EltSizeInBits = VT.getScalarSizeInBits();
50087 
50088   if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
50089     return DAG.getUNDEF(VT);
50090 
50091   if (llvm::all_of(Ops, [](SDValue Op) {
50092         return ISD::isBuildVectorAllZeros(Op.getNode());
50093       }))
50094     return getZeroVector(VT, Subtarget, DAG, DL);
50095 
50096   SDValue Op0 = Ops[0];
50097   bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
50098 
50099   // Repeated subvectors.
50100   if (IsSplat &&
50101       (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
50102     // If this broadcast is inserted into both halves, use a larger broadcast.
50103     if (Op0.getOpcode() == X86ISD::VBROADCAST)
50104       return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
50105 
50106     // If this scalar/subvector broadcast_load is inserted into both halves, use
50107     // a larger broadcast_load. Update other uses to use an extracted subvector.
50108     if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
50109         Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
50110       auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
50111       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
50112       SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
50113       SDValue BcastLd = DAG.getMemIntrinsicNode(Op0.getOpcode(), DL, Tys, Ops,
50114                                                 MemIntr->getMemoryVT(),
50115                                                 MemIntr->getMemOperand());
50116       DAG.ReplaceAllUsesOfValueWith(
50117           Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
50118       DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
50119       return BcastLd;
50120     }
50121 
50122     // If this is a simple subvector load repeated across multiple lanes, then
50123     // broadcast the load. Update other uses to use an extracted subvector.
50124     if (auto *Ld = dyn_cast<LoadSDNode>(Op0)) {
50125       if (Ld->isSimple() && !Ld->isNonTemporal() &&
50126           Ld->getExtensionType() == ISD::NON_EXTLOAD) {
50127         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
50128         SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
50129         SDValue BcastLd =
50130             DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops,
50131                                     Ld->getMemoryVT(), Ld->getMemOperand());
50132         DAG.ReplaceAllUsesOfValueWith(
50133             Op0,
50134             extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
50135         DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
50136         return BcastLd;
50137       }
50138     }
50139 
50140     // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
50141     if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
50142         (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
50143       return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
50144                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
50145                                      Op0.getOperand(0),
50146                                      DAG.getIntPtrConstant(0, DL)));
50147 
50148     // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
50149     if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
50150         (Subtarget.hasAVX2() ||
50151          (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
50152         Op0.getOperand(0).getValueType() == VT.getScalarType())
50153       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
50154 
50155     // concat_vectors(extract_subvector(broadcast(x)),
50156     //                extract_subvector(broadcast(x))) -> broadcast(x)
50157     if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50158         Op0.getOperand(0).getValueType() == VT) {
50159       if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
50160           Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
50161         return Op0.getOperand(0);
50162     }
50163   }
50164 
50165   // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
  // Only handle concatenation of the subvector high halves, which is what
  // vperm2x128 is best at.
50167   // TODO: This should go in combineX86ShufflesRecursively eventually.
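  // e.g. concat(extract_subvector(v8f32 A, 4), extract_subvector(v8f32 B, 4))
  //        --> (vperm2x128 A, B, 0x31), taking the high 128-bit half of each.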
50168   if (VT.is256BitVector() && Ops.size() == 2) {
50169     SDValue Src0 = peekThroughBitcasts(Ops[0]);
50170     SDValue Src1 = peekThroughBitcasts(Ops[1]);
50171     if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50172         Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
50173       EVT SrcVT0 = Src0.getOperand(0).getValueType();
50174       EVT SrcVT1 = Src1.getOperand(0).getValueType();
50175       unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
50176       unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
50177       if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
50178           Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
50179           Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
50180         return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
50181                            DAG.getBitcast(VT, Src0.getOperand(0)),
50182                            DAG.getBitcast(VT, Src1.getOperand(0)),
50183                            DAG.getTargetConstant(0x31, DL, MVT::i8));
50184       }
50185     }
50186   }
50187 
50188   // Repeated opcode.
50189   // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
50190   // but it currently struggles with different vector widths.
50191   if (llvm::all_of(Ops, [Op0](SDValue Op) {
50192         return Op.getOpcode() == Op0.getOpcode();
50193       })) {
50194     auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
50195       SmallVector<SDValue> Subs;
50196       for (SDValue SubOp : SubOps)
50197         Subs.push_back(SubOp.getOperand(I));
50198       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
50199     };
50200 
50201     unsigned NumOps = Ops.size();
50202     switch (Op0.getOpcode()) {
50203     case X86ISD::SHUFP: {
50204       // Add SHUFPD support if/when necessary.
50205       if (!IsSplat && VT.getScalarType() == MVT::f32 &&
50206           llvm::all_of(Ops, [Op0](SDValue Op) {
50207             return Op.getOperand(2) == Op0.getOperand(2);
50208           })) {
50209         return DAG.getNode(Op0.getOpcode(), DL, VT,
50210                            ConcatSubOperand(VT, Ops, 0),
50211                            ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
50212       }
50213       break;
50214     }
50215     case X86ISD::PSHUFHW:
50216     case X86ISD::PSHUFLW:
50217     case X86ISD::PSHUFD:
50218       if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
50219           Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
50220         return DAG.getNode(Op0.getOpcode(), DL, VT,
50221                            ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
50222       }
50223       LLVM_FALLTHROUGH;
50224     case X86ISD::VPERMILPI:
50225       // TODO - add support for vXf64/vXi64 shuffles.
50226       if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
50227           Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
50228         SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
50229         Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
50230                           Op0.getOperand(1));
50231         return DAG.getBitcast(VT, Res);
50232       }
50233       break;
50234     case X86ISD::VPERMV3:
50235       if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
50236         MVT OpVT = Op0.getSimpleValueType();
50237         int NumSrcElts = OpVT.getVectorNumElements();
50238         SmallVector<int, 64> ConcatMask;
50239         for (unsigned i = 0; i != NumOps; ++i) {
50240           SmallVector<int, 64> SubMask;
50241           SmallVector<SDValue, 2> SubOps;
50242           if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
50243                                     SubMask))
50244             break;
50245           for (int M : SubMask) {
50246             if (0 <= M) {
50247               M += M < NumSrcElts ? 0 : NumSrcElts;
50248               M += i * NumSrcElts;
50249             }
50250             ConcatMask.push_back(M);
50251           }
50252         }
50253         if (ConcatMask.size() == (NumOps * NumSrcElts)) {
50254           SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
50255                                           Ops[1].getOperand(0), DAG, DL);
50256           SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
50257                                           Ops[1].getOperand(2), DAG, DL);
50258           MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
50259           MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
50260           SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
50261           return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
50262         }
50263       }
50264       break;
50265     case X86ISD::VSHLI:
50266     case X86ISD::VSRLI:
50267       // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
50268       // TODO: Move this to LowerScalarImmediateShift?
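      // e.g. a v4i64 shift left by 32 moves each element's low i32 into its
      // high i32 and zeroes the low i32, which is just a v8i32 shuffle with a
      // zero vector.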
50269       if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
50270           llvm::all_of(Ops, [](SDValue Op) {
50271             return Op.getConstantOperandAPInt(1) == 32;
50272           })) {
50273         SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
50274         SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
50275         if (Op0.getOpcode() == X86ISD::VSHLI) {
50276           Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
50277                                      {8, 0, 8, 2, 8, 4, 8, 6});
50278         } else {
50279           Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
50280                                      {1, 8, 3, 8, 5, 8, 7, 8});
50281         }
50282         return DAG.getBitcast(VT, Res);
50283       }
50284       LLVM_FALLTHROUGH;
50285     case X86ISD::VSRAI:
50286       if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
50287            (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
50288             (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
50289           llvm::all_of(Ops, [Op0](SDValue Op) {
50290             return Op0.getOperand(1) == Op.getOperand(1);
50291           })) {
50292         return DAG.getNode(Op0.getOpcode(), DL, VT,
50293                            ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
50294       }
50295       break;
50296     case X86ISD::VPERMI:
50297     case X86ISD::VROTLI:
50298     case X86ISD::VROTRI:
50299       if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
50300           llvm::all_of(Ops, [Op0](SDValue Op) {
50301             return Op0.getOperand(1) == Op.getOperand(1);
50302           })) {
50303         return DAG.getNode(Op0.getOpcode(), DL, VT,
50304                            ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
50305       }
50306       break;
50307     case ISD::AND:
50308     case ISD::OR:
50309     case ISD::XOR:
50310     case X86ISD::ANDNP:
50311       // TODO: Add 256-bit support.
50312       if (!IsSplat && VT.is512BitVector()) {
50313         MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
50314         SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
50315                                  NumOps * SrcVT.getVectorNumElements());
50316         return DAG.getNode(Op0.getOpcode(), DL, VT,
50317                            ConcatSubOperand(SrcVT, Ops, 0),
50318                            ConcatSubOperand(SrcVT, Ops, 1));
50319       }
50320       break;
50321     case X86ISD::HADD:
50322     case X86ISD::HSUB:
50323     case X86ISD::FHADD:
50324     case X86ISD::FHSUB:
50325     case X86ISD::PACKSS:
50326     case X86ISD::PACKUS:
50327       if (!IsSplat && VT.is256BitVector() &&
50328           (VT.isFloatingPoint() || Subtarget.hasInt256())) {
50329         MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
50330         SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
50331                                  NumOps * SrcVT.getVectorNumElements());
50332         return DAG.getNode(Op0.getOpcode(), DL, VT,
50333                            ConcatSubOperand(SrcVT, Ops, 0),
50334                            ConcatSubOperand(SrcVT, Ops, 1));
50335       }
50336       break;
50337     case X86ISD::PALIGNR:
50338       if (!IsSplat &&
50339           ((VT.is256BitVector() && Subtarget.hasInt256()) ||
50340            (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
50341           llvm::all_of(Ops, [Op0](SDValue Op) {
50342             return Op0.getOperand(2) == Op.getOperand(2);
50343           })) {
50344         return DAG.getNode(Op0.getOpcode(), DL, VT,
50345                            ConcatSubOperand(VT, Ops, 0),
50346                            ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
50347       }
50348       break;
50349     }
50350   }
50351 
50352   // Fold subvector loads into one.
50353   // If needed, look through bitcasts to get to the load.
50354   if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
50355     bool Fast;
50356     const X86TargetLowering *TLI = Subtarget.getTargetLowering();
50357     if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
50358                                 *FirstLd->getMemOperand(), &Fast) &&
50359         Fast) {
50360       if (SDValue Ld =
50361               EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
50362         return Ld;
50363     }
50364   }
50365 
50366   return SDValue();
50367 }
50368 
50369 static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
50370                                     TargetLowering::DAGCombinerInfo &DCI,
50371                                     const X86Subtarget &Subtarget) {
50372   EVT VT = N->getValueType(0);
50373   EVT SrcVT = N->getOperand(0).getValueType();
50374   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50375 
50376   // Don't do anything for i1 vectors.
50377   if (VT.getVectorElementType() == MVT::i1)
50378     return SDValue();
50379 
50380   if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
50381     SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
50382     if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
50383                                            DCI, Subtarget))
50384       return R;
50385   }
50386 
50387   return SDValue();
50388 }
50389 
50390 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
50391                                       TargetLowering::DAGCombinerInfo &DCI,
50392                                       const X86Subtarget &Subtarget) {
50393   if (DCI.isBeforeLegalizeOps())
50394     return SDValue();
50395 
50396   MVT OpVT = N->getSimpleValueType(0);
50397 
50398   bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
50399 
50400   SDLoc dl(N);
50401   SDValue Vec = N->getOperand(0);
50402   SDValue SubVec = N->getOperand(1);
50403 
50404   uint64_t IdxVal = N->getConstantOperandVal(2);
50405   MVT SubVecVT = SubVec.getSimpleValueType();
50406 
50407   if (Vec.isUndef() && SubVec.isUndef())
50408     return DAG.getUNDEF(OpVT);
50409 
50410   // Inserting undefs/zeros into zeros/undefs is a zero vector.
50411   if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
50412       (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
50413     return getZeroVector(OpVT, Subtarget, DAG, dl);
50414 
50415   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
50416     // If we're inserting into a zero vector and then into a larger zero vector,
50417     // just insert into the larger zero vector directly.
50418     if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
50419         ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
50420       uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
50421       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50422                          getZeroVector(OpVT, Subtarget, DAG, dl),
50423                          SubVec.getOperand(1),
50424                          DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
50425     }
50426 
50427     // If we're inserting into a zero vector and our input was extracted from an
50428     // insert into a zero vector of the same type, and the extraction was at
50429     // least as large as the original insertion, just insert the original
50430     // subvector into a zero vector.
50431     if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
50432         isNullConstant(SubVec.getOperand(1)) &&
50433         SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
50434       SDValue Ins = SubVec.getOperand(0);
50435       if (isNullConstant(Ins.getOperand(2)) &&
50436           ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
50437           Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
50438               SubVecVT.getFixedSizeInBits())
50439         return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50440                            getZeroVector(OpVT, Subtarget, DAG, dl),
50441                            Ins.getOperand(1), N->getOperand(2));
50442     }
50443   }
50444 
50445   // Stop here if this is an i1 vector.
50446   if (IsI1Vector)
50447     return SDValue();
50448 
50449   // If this is an insert of an extract, combine to a shuffle. Don't do this
50450   // if the insert or extract can be represented with a subregister operation.
50451   if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50452       SubVec.getOperand(0).getSimpleValueType() == OpVT &&
50453       (IdxVal != 0 ||
50454        !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
50455     int ExtIdxVal = SubVec.getConstantOperandVal(1);
50456     if (ExtIdxVal != 0) {
50457       int VecNumElts = OpVT.getVectorNumElements();
50458       int SubVecNumElts = SubVecVT.getVectorNumElements();
50459       SmallVector<int, 64> Mask(VecNumElts);
50460       // First create an identity shuffle mask.
50461       for (int i = 0; i != VecNumElts; ++i)
50462         Mask[i] = i;
50463       // Now insert the extracted portion.
50464       for (int i = 0; i != SubVecNumElts; ++i)
50465         Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
50466 
50467       return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
50468     }
50469   }
50470 
50471   // Match concat_vector style patterns.
50472   SmallVector<SDValue, 2> SubVectorOps;
50473   if (collectConcatOps(N, SubVectorOps)) {
50474     if (SDValue Fold =
50475             combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
50476       return Fold;
50477 
50478     // If we're inserting all zeros into the upper half, change this to
50479     // a concat with zero. We will match this to a move
50480     // with implicit upper bit zeroing during isel.
50481     // We do this here because we don't want combineConcatVectorOps to
50482     // create INSERT_SUBVECTOR from CONCAT_VECTORS.
50483     if (SubVectorOps.size() == 2 &&
50484         ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
50485       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50486                          getZeroVector(OpVT, Subtarget, DAG, dl),
50487                          SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
50488   }
50489 
50490   // If this is a broadcast insert into an upper undef, use a larger broadcast.
50491   if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
50492     return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
50493 
50494   // If this is a broadcast load inserted into an upper undef, use a larger
50495   // broadcast load.
50496   if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
50497       SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
50498     auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
50499     SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
50500     SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
50501     SDValue BcastLd =
50502         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
50503                                 MemIntr->getMemoryVT(),
50504                                 MemIntr->getMemOperand());
50505     DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
50506     return BcastLd;
50507   }
50508 
50509   // If we're splatting the lower half subvector of a full vector load into the
50510   // upper half, attempt to create a subvector broadcast.
50511   if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
50512       Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
50513     auto *VecLd = dyn_cast<LoadSDNode>(Vec);
50514     auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
50515     if (VecLd && SubLd &&
50516         DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
50517                                            SubVec.getValueSizeInBits() / 8, 0))
50518       return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
50519                                SubLd, 0, DAG);
50520   }
50521 
50522   return SDValue();
50523 }
50524 
50525 /// If we are extracting a subvector of a vector select and the select condition
50526 /// is composed of concatenated vectors, try to narrow the select width. This
50527 /// is a common pattern for AVX1 integer code because 256-bit selects may be
50528 /// legal, but there is almost no integer math/logic available for 256-bit.
50529 /// This function should only be called with legal types (otherwise, the calls
50530 /// to get simple value types will assert).
50531 static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
50532   SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
50533   SmallVector<SDValue, 4> CatOps;
50534   if (Sel.getOpcode() != ISD::VSELECT ||
50535       !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
50536     return SDValue();
50537 
50538   // Note: We assume simple value types because this should only be called with
50539   //       legal operations/types.
50540   // TODO: This can be extended to handle extraction to 256-bits.
50541   MVT VT = Ext->getSimpleValueType(0);
50542   if (!VT.is128BitVector())
50543     return SDValue();
50544 
50545   MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
50546   if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
50547     return SDValue();
50548 
50549   MVT WideVT = Ext->getOperand(0).getSimpleValueType();
50550   MVT SelVT = Sel.getSimpleValueType();
50551   assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
50552          "Unexpected vector type with legal operations");
50553 
50554   unsigned SelElts = SelVT.getVectorNumElements();
50555   unsigned CastedElts = WideVT.getVectorNumElements();
50556   unsigned ExtIdx = Ext->getConstantOperandVal(1);
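        // For example, extracting the upper v2i64 half (ExtIdx == 2) of a v4i64
        // that is a bitcast of a v8i32 vselect: SelElts == 8, CastedElts == 4, so
        // ExtIdx is rescaled to 4 (in v8i32 units) before the 128-bit extracts below.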
50557   if (SelElts % CastedElts == 0) {
50558     // The select has the same or more (narrower) elements than the extract
50559     // operand. The extraction index gets scaled by that factor.
50560     ExtIdx *= (SelElts / CastedElts);
50561   } else if (CastedElts % SelElts == 0) {
50562     // The select has fewer (wider) elements than the extract operand. Make sure
50563     // that the extraction index can be divided evenly.
50564     unsigned IndexDivisor = CastedElts / SelElts;
50565     if (ExtIdx % IndexDivisor != 0)
50566       return SDValue();
50567     ExtIdx /= IndexDivisor;
50568   } else {
50569     llvm_unreachable("Element counts of simple vector types are not divisible?");
50570   }
50571 
50572   unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
50573   unsigned NarrowElts = SelElts / NarrowingFactor;
50574   MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
50575   SDLoc DL(Ext);
50576   SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
50577   SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
50578   SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
50579   SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
50580   return DAG.getBitcast(VT, NarrowSel);
50581 }
50582 
50583 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
50584                                        TargetLowering::DAGCombinerInfo &DCI,
50585                                        const X86Subtarget &Subtarget) {
50586   // For AVX1 only, if we are extracting from a 256-bit and+not (which will
50587   // eventually get combined/lowered into ANDNP) with a concatenated operand,
50588   // split the 'and' into 128-bit ops to avoid the concatenate and extract.
50589   // We let generic combining take over from there to simplify the
50590   // insert/extract and 'not'.
50591   // This pattern emerges during AVX1 legalization. We handle it before lowering
50592   // to avoid complications like splitting constant vector loads.
50593 
50594   // Capture the original wide type in the likely case that we need to bitcast
50595   // back to this type.
50596   if (!N->getValueType(0).isSimple())
50597     return SDValue();
50598 
50599   MVT VT = N->getSimpleValueType(0);
50600   SDValue InVec = N->getOperand(0);
50601   unsigned IdxVal = N->getConstantOperandVal(1);
50602   SDValue InVecBC = peekThroughBitcasts(InVec);
50603   EVT InVecVT = InVec.getValueType();
50604   unsigned SizeInBits = VT.getSizeInBits();
50605   unsigned InSizeInBits = InVecVT.getSizeInBits();
50606   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50607 
50608   if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
50609       TLI.isTypeLegal(InVecVT) &&
50610       InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
50611     auto isConcatenatedNot = [](SDValue V) {
50612       V = peekThroughBitcasts(V);
50613       if (!isBitwiseNot(V))
50614         return false;
50615       SDValue NotOp = V->getOperand(0);
50616       return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
50617     };
50618     if (isConcatenatedNot(InVecBC.getOperand(0)) ||
50619         isConcatenatedNot(InVecBC.getOperand(1))) {
50620       // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
50621       SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
50622       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
50623                          DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
50624     }
50625   }
50626 
50627   if (DCI.isBeforeLegalizeOps())
50628     return SDValue();
50629 
50630   if (SDValue V = narrowExtractedVectorSelect(N, DAG))
50631     return V;
50632 
50633   if (ISD::isBuildVectorAllZeros(InVec.getNode()))
50634     return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
50635 
50636   if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
50637     if (VT.getScalarType() == MVT::i1)
50638       return DAG.getConstant(1, SDLoc(N), VT);
50639     return getOnesVector(VT, DAG, SDLoc(N));
50640   }
50641 
50642   if (InVec.getOpcode() == ISD::BUILD_VECTOR)
50643     return DAG.getBuildVector(
50644         VT, SDLoc(N),
50645         InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
50646 
50647   // If we are extracting from an insert into a zero vector, replace with a
50648   // smaller insert into zero as long as the extraction covers at least as
50649   // much as the original inserted subvector. Don't do this for i1 vectors.
50650   if (VT.getVectorElementType() != MVT::i1 &&
50651       InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
50652       InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
50653       ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
50654       InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
50655     SDLoc DL(N);
50656     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
50657                        getZeroVector(VT, Subtarget, DAG, DL),
50658                        InVec.getOperand(1), InVec.getOperand(2));
50659   }
50660 
50661   // If we're extracting an upper subvector from a broadcast, just extract the
50662   // lowest subvector instead, which should allow SimplifyDemandedVectorElts
50663   // to do more simplifications.
50664   if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
50665                       InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
50666                       DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
50667     return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
50668 
50669   // If we're extracting a broadcasted subvector, just use the lowest subvector.
50670   if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
50671       cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
50672     return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
50673 
50674   // Attempt to extract from the source of a shuffle vector.
50675   if ((InSizeInBits % SizeInBits) == 0 &&
50676       (IdxVal % VT.getVectorNumElements()) == 0) {
50677     SmallVector<int, 32> ShuffleMask;
50678     SmallVector<int, 32> ScaledMask;
50679     SmallVector<SDValue, 2> ShuffleInputs;
50680     unsigned NumSubVecs = InSizeInBits / SizeInBits;
50681     // Decode the shuffle mask and scale it so it's shuffling subvectors.
50682     if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
50683         scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
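            // ScaledMask[i] now says which whole input subvector (or undef/zero
            // sentinel) supplies subvector i of the shuffle result.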
50684       unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
50685       if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
50686         return DAG.getUNDEF(VT);
50687       if (ScaledMask[SubVecIdx] == SM_SentinelZero)
50688         return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
50689       SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
50690       if (Src.getValueSizeInBits() == InSizeInBits) {
50691         unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
50692         unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
50693         return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
50694                                 SDLoc(N), SizeInBits);
50695       }
50696     }
50697   }
50698 
50699   // If we're extracting the lowest subvector and we're the only user,
50700   // we may be able to perform this with a smaller vector width.
50701   unsigned InOpcode = InVec.getOpcode();
50702   if (IdxVal == 0 && InVec.hasOneUse()) {
50703     if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
50704       // v2f64 CVTDQ2PD(v4i32).
50705       if (InOpcode == ISD::SINT_TO_FP &&
50706           InVec.getOperand(0).getValueType() == MVT::v4i32) {
50707         return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
50708       }
50709       // v2f64 CVTUDQ2PD(v4i32).
50710       if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
50711           InVec.getOperand(0).getValueType() == MVT::v4i32) {
50712         return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
50713       }
50714       // v2f64 CVTPS2PD(v4f32).
50715       if (InOpcode == ISD::FP_EXTEND &&
50716           InVec.getOperand(0).getValueType() == MVT::v4f32) {
50717         return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
50718       }
50719     }
50720     if ((InOpcode == ISD::ANY_EXTEND ||
50721          InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
50722          InOpcode == ISD::ZERO_EXTEND ||
50723          InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
50724          InOpcode == ISD::SIGN_EXTEND ||
50725          InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50726         (SizeInBits == 128 || SizeInBits == 256) &&
50727         InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
50728       SDLoc DL(N);
50729       SDValue Ext = InVec.getOperand(0);
50730       if (Ext.getValueSizeInBits() > SizeInBits)
50731         Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
50732       unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
50733       return DAG.getNode(ExtOp, DL, VT, Ext);
50734     }
50735     if (InOpcode == ISD::VSELECT &&
50736         InVec.getOperand(0).getValueType().is256BitVector() &&
50737         InVec.getOperand(1).getValueType().is256BitVector() &&
50738         InVec.getOperand(2).getValueType().is256BitVector()) {
50739       SDLoc DL(N);
50740       SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
50741       SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
50742       SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
50743       return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
50744     }
50745     if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
50746         (VT.is128BitVector() || VT.is256BitVector())) {
50747       SDLoc DL(N);
50748       SDValue InVecSrc = InVec.getOperand(0);
50749       unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
50750       SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
50751       return DAG.getNode(InOpcode, DL, VT, Ext);
50752     }
50753   }
50754 
50755   // Always split vXi64 logical shifts where we're extracting the upper 32 bits,
50756   // as this is very likely to fold into a shuffle/truncation.
50757   if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
50758       InVecVT.getScalarSizeInBits() == 64 &&
50759       InVec.getConstantOperandAPInt(1) == 32) {
50760     SDLoc DL(N);
50761     SDValue Ext =
50762         extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
50763     return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
50764   }
50765 
50766   return SDValue();
50767 }
50768 
50769 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
50770   EVT VT = N->getValueType(0);
50771   SDValue Src = N->getOperand(0);
50772   SDLoc DL(N);
50773 
50774   // If this is a scalar_to_vector to v1i1 from an AND with 1, bypass the and.
50775   // This occurs frequently in our masked scalar intrinsic code and our
50776   // floating point select lowering with AVX512.
50777   // TODO: SimplifyDemandedBits instead?
50778   if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
50779     if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
50780       if (C->getAPIntValue().isOneValue())
50781         return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
50782                            Src.getOperand(0));
50783 
50784   // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
50785   if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
50786       Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
50787       Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
50788     if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
50789       if (C->isNullValue())
50790         return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
50791                            Src.getOperand(1));
50792 
50793   // Reduce v2i64 to v4i32 if we don't need the upper bits.
50794   // TODO: Move to DAGCombine/SimplifyDemandedBits?
50795   if (VT == MVT::v2i64 || VT == MVT::v2f64) {
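          // Helper: if Op is a single-use i64 any_extend of a value at most 32 bits
          // wide, return that narrow source; if it is a single-use i64 extload of at
          // most 32 bits, return the load itself; otherwise return SDValue().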
50796     auto IsAnyExt64 = [](SDValue Op) {
50797       if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
50798         return SDValue();
50799       if (Op.getOpcode() == ISD::ANY_EXTEND &&
50800           Op.getOperand(0).getScalarValueSizeInBits() <= 32)
50801         return Op.getOperand(0);
50802       if (auto *Ld = dyn_cast<LoadSDNode>(Op))
50803         if (Ld->getExtensionType() == ISD::EXTLOAD &&
50804             Ld->getMemoryVT().getScalarSizeInBits() <= 32)
50805           return Op;
50806       return SDValue();
50807     };
50808     if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
50809       return DAG.getBitcast(
50810           VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
50811                           DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
50812   }
50813 
50814   // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
50815   if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
50816       Src.getOperand(0).getValueType() == MVT::x86mmx)
50817     return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
50818 
50819   // See if we're broadcasting the scalar value, in which case just reuse that.
50820   // Ensure the same SDValue from the SDNode use is being used.
50821   if (VT.getScalarType() == Src.getValueType())
50822     for (SDNode *User : Src->uses())
50823       if (User->getOpcode() == X86ISD::VBROADCAST &&
50824           Src == User->getOperand(0)) {
50825         unsigned SizeInBits = VT.getFixedSizeInBits();
50826         unsigned BroadcastSizeInBits =
50827             User->getValueSizeInBits(0).getFixedSize();
50828         if (BroadcastSizeInBits == SizeInBits)
50829           return SDValue(User, 0);
50830         if (BroadcastSizeInBits > SizeInBits)
50831           return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
50832         // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
50833         // coverage.
50834       }
50835 
50836   return SDValue();
50837 }
50838 
50839 // Simplify PMULDQ and PMULUDQ operations.
50840 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
50841                              TargetLowering::DAGCombinerInfo &DCI,
50842                              const X86Subtarget &Subtarget) {
50843   SDValue LHS = N->getOperand(0);
50844   SDValue RHS = N->getOperand(1);
50845 
50846   // Canonicalize constant to RHS.
50847   if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
50848       !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
50849     return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
50850 
50851   // Multiply by zero.
50852   // Don't return RHS as it may contain UNDEFs.
50853   if (ISD::isBuildVectorAllZeros(RHS.getNode()))
50854     return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
50855 
50856   // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
50857   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50858   if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
50859     return SDValue(N, 0);
50860 
50861   // If the input is an extend_invec and the SimplifyDemandedBits call didn't
50862   // convert it to any_extend_invec (due to the LegalOperations check), manually
50863   // do the conversion to a vector shuffle instead. This exposes combine
50864   // opportunities missed by combineEXTEND_VECTOR_INREG not calling
50865   // combineX86ShufflesRecursively on SSE4.1 targets.
50866   // FIXME: This is basically a hack around several other issues related to
50867   // ANY_EXTEND_VECTOR_INREG.
50868   if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
50869       (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
50870        LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50871       LHS.getOperand(0).getValueType() == MVT::v4i32) {
50872     SDLoc dl(N);
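          // The {0,-1,1,-1} shuffle places the two low i32 elements into the even
          // dword lanes (the positions a *_extend_vector_inreg to v2i64 would fill)
          // and leaves the odd lanes undef, which is fine since PMULDQ/PMULUDQ only
          // read the low 32 bits of each 64-bit element.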
50873     LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
50874                                LHS.getOperand(0), { 0, -1, 1, -1 });
50875     LHS = DAG.getBitcast(MVT::v2i64, LHS);
50876     return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
50877   }
50878   if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
50879       (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
50880        RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50881       RHS.getOperand(0).getValueType() == MVT::v4i32) {
50882     SDLoc dl(N);
50883     RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
50884                                RHS.getOperand(0), { 0, -1, 1, -1 });
50885     RHS = DAG.getBitcast(MVT::v2i64, RHS);
50886     return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
50887   }
50888 
50889   return SDValue();
50890 }
50891 
50892 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
50893                                           TargetLowering::DAGCombinerInfo &DCI,
50894                                           const X86Subtarget &Subtarget) {
50895   EVT VT = N->getValueType(0);
50896   SDValue In = N->getOperand(0);
50897   unsigned Opcode = N->getOpcode();
50898   unsigned InOpcode = In.getOpcode();
50899   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50900 
50901   // Try to merge vector loads and extend_inreg to an extload.
50902   if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
50903       In.hasOneUse()) {
50904     auto *Ld = cast<LoadSDNode>(In);
50905     if (Ld->isSimple()) {
50906       MVT SVT = In.getSimpleValueType().getVectorElementType();
50907       ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
50908                                  ? ISD::SEXTLOAD
50909                                  : ISD::ZEXTLOAD;
50910       EVT MemVT = VT.changeVectorElementType(SVT);
50911       if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
50912         SDValue Load =
50913             DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
50914                            Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
50915                            Ld->getMemOperand()->getFlags());
50916         DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
50917         return Load;
50918       }
50919     }
50920   }
50921 
50922   // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
50923   if (Opcode == InOpcode)
50924     return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0));
50925 
50926   // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
50927   // -> EXTEND_VECTOR_INREG(X).
50928   // TODO: Handle non-zero subvector indices.
50929   if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
50930       In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
50931       In.getOperand(0).getOperand(0).getValueSizeInBits() ==
50932           In.getValueSizeInBits())
50933     return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0));
50934 
50935   // Attempt to combine as a shuffle.
50936   // TODO: General ZERO_EXTEND_VECTOR_INREG support.
50937   if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
50938       (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
50939     SDValue Op(N, 0);
50940     if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
50941       if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50942         return Res;
50943   }
50944 
50945   return SDValue();
50946 }
50947 
50948 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
50949                              TargetLowering::DAGCombinerInfo &DCI) {
50950   EVT VT = N->getValueType(0);
50951 
50952   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
50953     return DAG.getConstant(0, SDLoc(N), VT);
50954 
50955   APInt KnownUndef, KnownZero;
50956   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50957   APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
50958   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
50959                                      KnownZero, DCI))
50960     return SDValue(N, 0);
50961 
50962   return SDValue();
50963 }
50964 
50965 // Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
50966 // Done as a combine because the separate lowerings of fp16_to_fp and fp_to_fp16
50967 // produce extra instructions between the conversions due to going to scalar and back.
50968 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
50969                                  const X86Subtarget &Subtarget) {
50970   if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
50971     return SDValue();
50972 
50973   if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
50974     return SDValue();
50975 
50976   if (N->getValueType(0) != MVT::f32 ||
50977       N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
50978     return SDValue();
50979 
50980   SDLoc dl(N);
50981   SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
50982                             N->getOperand(0).getOperand(0));
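        // CVTPS2PH immediate 4 sets bit 2, i.e. round using the current MXCSR
        // rounding mode rather than an explicit rounding mode from the immediate.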
50983   Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
50984                     DAG.getTargetConstant(4, dl, MVT::i32));
50985   Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
50986   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
50987                      DAG.getIntPtrConstant(0, dl));
50988 }
50989 
50990 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
50991                                 const X86Subtarget &Subtarget) {
50992   if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
50993     return SDValue();
50994 
50995   bool IsStrict = N->isStrictFPOpcode();
50996   EVT VT = N->getValueType(0);
50997   SDValue Src = N->getOperand(IsStrict ? 1 : 0);
50998   EVT SrcVT = Src.getValueType();
50999 
51000   if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
51001     return SDValue();
51002 
51003   if (VT.getVectorElementType() != MVT::f32 &&
51004       VT.getVectorElementType() != MVT::f64)
51005     return SDValue();
51006 
51007   unsigned NumElts = VT.getVectorNumElements();
51008   if (NumElts == 1 || !isPowerOf2_32(NumElts))
51009     return SDValue();
51010 
51011   SDLoc dl(N);
51012 
51013   // Convert the input to vXi16.
51014   EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
51015   Src = DAG.getBitcast(IntVT, Src);
51016 
51017   // Widen to at least 8 input elements.
51018   if (NumElts < 8) {
51019     unsigned NumConcats = 8 / NumElts;
51020     SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
51021                                 : DAG.getConstant(0, dl, IntVT);
51022     SmallVector<SDValue, 4> Ops(NumConcats, Fill);
51023     Ops[0] = Src;
51024     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
51025   }
51026 
51027   // Destination is vXf32 with at least 4 elements.
51028   EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
51029                                std::max(4U, NumElts));
51030   SDValue Cvt, Chain;
51031   if (IsStrict) {
51032     Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
51033                       {N->getOperand(0), Src});
51034     Chain = Cvt.getValue(1);
51035   } else {
51036     Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
51037   }
51038 
51039   if (NumElts < 4) {
51040     assert(NumElts == 2 && "Unexpected size");
51041     Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
51042                       DAG.getIntPtrConstant(0, dl));
51043   }
51044 
51045   if (IsStrict) {
51046     // Extend to the original VT if necessary.
51047     if (Cvt.getValueType() != VT) {
51048       Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
51049                         {Chain, Cvt});
51050       Chain = Cvt.getValue(1);
51051     }
51052     return DAG.getMergeValues({Cvt, Chain}, dl);
51053   }
51054 
51055   // Extend to the original VT if necessary.
51056   return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
51057 }
51058 
51059 // Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
51060 // from. Limit this to cases where the loads have the same input chain and the
51061 // output chains are unused. This avoids any memory ordering issues.
51062 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
51063                                      TargetLowering::DAGCombinerInfo &DCI) {
51064   assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
51065           N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
51066          "Unknown broadcast load type");
51067 
51068   // Only do this if the chain result is unused.
51069   if (N->hasAnyUseOfValue(1))
51070     return SDValue();
51071 
51072   auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
51073 
51074   SDValue Ptr = MemIntrin->getBasePtr();
51075   SDValue Chain = MemIntrin->getChain();
51076   EVT VT = N->getSimpleValueType(0);
51077   EVT MemVT = MemIntrin->getMemoryVT();
51078 
51079   // Look at other users of our base pointer and try to find a wider broadcast.
51080   // The input chain and the size of the memory VT must match.
51081   for (SDNode *User : Ptr->uses())
51082     if (User != N && User->getOpcode() == N->getOpcode() &&
51083         cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
51084         cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
51085         cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
51086             MemVT.getSizeInBits() &&
51087         !User->hasAnyUseOfValue(1) &&
51088         User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
51089       SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
51090                                          VT.getSizeInBits());
51091       Extract = DAG.getBitcast(VT, Extract);
51092       return DCI.CombineTo(N, Extract, SDValue(User, 1));
51093     }
51094 
51095   return SDValue();
51096 }
51097 
51098 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
51099                                const X86Subtarget &Subtarget) {
51100   if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
51101     return SDValue();
51102 
51103   EVT VT = N->getValueType(0);
51104   SDValue Src = N->getOperand(0);
51105   EVT SrcVT = Src.getValueType();
51106 
51107   if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
51108       SrcVT.getVectorElementType() != MVT::f32)
51109     return SDValue();
51110 
51111   unsigned NumElts = VT.getVectorNumElements();
51112   if (NumElts == 1 || !isPowerOf2_32(NumElts))
51113     return SDValue();
51114 
51115   SDLoc dl(N);
51116 
51117   // Widen to at least 4 input elements.
51118   if (NumElts < 4)
51119     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
51120                       DAG.getConstantFP(0.0, dl, SrcVT));
51121 
51122   // Destination is v8i16 with at least 8 elements.
51123   EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51124                                std::max(8U, NumElts));
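        // As in combineFP16_TO_FP above, the immediate 4 asks CVTPS2PH to round
        // using the current MXCSR rounding mode.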
51125   SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
51126                             DAG.getTargetConstant(4, dl, MVT::i32));
51127 
51128   // Extract down to real number of elements.
51129   if (NumElts < 8) {
51130     EVT IntVT = VT.changeVectorElementTypeToInteger();
51131     Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
51132                       DAG.getIntPtrConstant(0, dl));
51133   }
51134 
51135   return DAG.getBitcast(VT, Cvt);
51136 }
51137 
51138 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
51139   SDValue Src = N->getOperand(0);
51140 
51141   // Turn MOVDQ2Q+simple_load into an mmx load.
51142   if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
51143     LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
51144 
51145     if (LN->isSimple()) {
51146       SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
51147                                   LN->getBasePtr(),
51148                                   LN->getPointerInfo(),
51149                                   LN->getOriginalAlign(),
51150                                   LN->getMemOperand()->getFlags());
51151       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
51152       return NewLd;
51153     }
51154   }
51155 
51156   return SDValue();
51157 }
51158 
51159 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
51160                            TargetLowering::DAGCombinerInfo &DCI) {
51161   unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
51162   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51163   if (TLI.SimplifyDemandedBits(SDValue(N, 0),
51164                                APInt::getAllOnesValue(NumBits), DCI))
51165     return SDValue(N, 0);
51166 
51167   return SDValue();
51168 }
51169 
51170 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
51171                                              DAGCombinerInfo &DCI) const {
51172   SelectionDAG &DAG = DCI.DAG;
51173   switch (N->getOpcode()) {
51174   default: break;
51175   case ISD::SCALAR_TO_VECTOR:
51176     return combineScalarToVector(N, DAG);
51177   case ISD::EXTRACT_VECTOR_ELT:
51178   case X86ISD::PEXTRW:
51179   case X86ISD::PEXTRB:
51180     return combineExtractVectorElt(N, DAG, DCI, Subtarget);
51181   case ISD::CONCAT_VECTORS:
51182     return combineConcatVectors(N, DAG, DCI, Subtarget);
51183   case ISD::INSERT_SUBVECTOR:
51184     return combineInsertSubvector(N, DAG, DCI, Subtarget);
51185   case ISD::EXTRACT_SUBVECTOR:
51186     return combineExtractSubvector(N, DAG, DCI, Subtarget);
51187   case ISD::VSELECT:
51188   case ISD::SELECT:
51189   case X86ISD::BLENDV:      return combineSelect(N, DAG, DCI, Subtarget);
51190   case ISD::BITCAST:        return combineBitcast(N, DAG, DCI, Subtarget);
51191   case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
51192   case X86ISD::CMP:         return combineCMP(N, DAG);
51193   case ISD::ADD:            return combineAdd(N, DAG, DCI, Subtarget);
51194   case ISD::SUB:            return combineSub(N, DAG, DCI, Subtarget);
51195   case X86ISD::ADD:
51196   case X86ISD::SUB:         return combineX86AddSub(N, DAG, DCI);
51197   case X86ISD::SBB:         return combineSBB(N, DAG);
51198   case X86ISD::ADC:         return combineADC(N, DAG, DCI);
51199   case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
51200   case ISD::SHL:            return combineShiftLeft(N, DAG);
51201   case ISD::SRA:            return combineShiftRightArithmetic(N, DAG, Subtarget);
51202   case ISD::SRL:            return combineShiftRightLogical(N, DAG, DCI, Subtarget);
51203   case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
51204   case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
51205   case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
51206   case X86ISD::BEXTR:
51207   case X86ISD::BEXTRI:      return combineBEXTR(N, DAG, DCI, Subtarget);
51208   case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
51209   case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
51210   case ISD::STORE:          return combineStore(N, DAG, DCI, Subtarget);
51211   case ISD::MSTORE:         return combineMaskedStore(N, DAG, DCI, Subtarget);
51212   case X86ISD::VEXTRACT_STORE:
51213     return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
51214   case ISD::SINT_TO_FP:
51215   case ISD::STRICT_SINT_TO_FP:
51216     return combineSIntToFP(N, DAG, DCI, Subtarget);
51217   case ISD::UINT_TO_FP:
51218   case ISD::STRICT_UINT_TO_FP:
51219     return combineUIntToFP(N, DAG, Subtarget);
51220   case ISD::FADD:
51221   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
51222   case ISD::FNEG:           return combineFneg(N, DAG, DCI, Subtarget);
51223   case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
51224   case X86ISD::VTRUNC:      return combineVTRUNC(N, DAG, DCI);
51225   case X86ISD::ANDNP:       return combineAndnp(N, DAG, DCI, Subtarget);
51226   case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
51227   case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
51228   case X86ISD::FXOR:
51229   case X86ISD::FOR:         return combineFOr(N, DAG, DCI, Subtarget);
51230   case X86ISD::FMIN:
51231   case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
51232   case ISD::FMINNUM:
51233   case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
51234   case X86ISD::CVTSI2P:
51235   case X86ISD::CVTUI2P:     return combineX86INT_TO_FP(N, DAG, DCI);
51236   case X86ISD::CVTP2SI:
51237   case X86ISD::CVTP2UI:
51238   case X86ISD::STRICT_CVTTP2SI:
51239   case X86ISD::CVTTP2SI:
51240   case X86ISD::STRICT_CVTTP2UI:
51241   case X86ISD::CVTTP2UI:
51242                             return combineCVTP2I_CVTTP2I(N, DAG, DCI);
51243   case X86ISD::STRICT_CVTPH2PS:
51244   case X86ISD::CVTPH2PS:    return combineCVTPH2PS(N, DAG, DCI);
51245   case X86ISD::BT:          return combineBT(N, DAG, DCI);
51246   case ISD::ANY_EXTEND:
51247   case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
51248   case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
51249   case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
51250   case ISD::ANY_EXTEND_VECTOR_INREG:
51251   case ISD::SIGN_EXTEND_VECTOR_INREG:
51252   case ISD::ZERO_EXTEND_VECTOR_INREG:
51253     return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
51254   case ISD::SETCC:          return combineSetCC(N, DAG, DCI, Subtarget);
51255   case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
51256   case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
51257   case X86ISD::PACKSS:
51258   case X86ISD::PACKUS:      return combineVectorPack(N, DAG, DCI, Subtarget);
51259   case X86ISD::HADD:
51260   case X86ISD::HSUB:
51261   case X86ISD::FHADD:
51262   case X86ISD::FHSUB:       return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
51263   case X86ISD::VSHL:
51264   case X86ISD::VSRA:
51265   case X86ISD::VSRL:
51266     return combineVectorShiftVar(N, DAG, DCI, Subtarget);
51267   case X86ISD::VSHLI:
51268   case X86ISD::VSRAI:
51269   case X86ISD::VSRLI:
51270     return combineVectorShiftImm(N, DAG, DCI, Subtarget);
51271   case ISD::INSERT_VECTOR_ELT:
51272   case X86ISD::PINSRB:
51273   case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
51274   case X86ISD::SHUFP:       // Handle all target specific shuffles
51275   case X86ISD::INSERTPS:
51276   case X86ISD::EXTRQI:
51277   case X86ISD::INSERTQI:
51278   case X86ISD::VALIGN:
51279   case X86ISD::PALIGNR:
51280   case X86ISD::VSHLDQ:
51281   case X86ISD::VSRLDQ:
51282   case X86ISD::BLENDI:
51283   case X86ISD::UNPCKH:
51284   case X86ISD::UNPCKL:
51285   case X86ISD::MOVHLPS:
51286   case X86ISD::MOVLHPS:
51287   case X86ISD::PSHUFB:
51288   case X86ISD::PSHUFD:
51289   case X86ISD::PSHUFHW:
51290   case X86ISD::PSHUFLW:
51291   case X86ISD::MOVSHDUP:
51292   case X86ISD::MOVSLDUP:
51293   case X86ISD::MOVDDUP:
51294   case X86ISD::MOVSS:
51295   case X86ISD::MOVSD:
51296   case X86ISD::VBROADCAST:
51297   case X86ISD::VPPERM:
51298   case X86ISD::VPERMI:
51299   case X86ISD::VPERMV:
51300   case X86ISD::VPERMV3:
51301   case X86ISD::VPERMIL2:
51302   case X86ISD::VPERMILPI:
51303   case X86ISD::VPERMILPV:
51304   case X86ISD::VPERM2X128:
51305   case X86ISD::SHUF128:
51306   case X86ISD::VZEXT_MOVL:
51307   case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
51308   case X86ISD::FMADD_RND:
51309   case X86ISD::FMSUB:
51310   case X86ISD::STRICT_FMSUB:
51311   case X86ISD::FMSUB_RND:
51312   case X86ISD::FNMADD:
51313   case X86ISD::STRICT_FNMADD:
51314   case X86ISD::FNMADD_RND:
51315   case X86ISD::FNMSUB:
51316   case X86ISD::STRICT_FNMSUB:
51317   case X86ISD::FNMSUB_RND:
51318   case ISD::FMA:
51319   case ISD::STRICT_FMA:     return combineFMA(N, DAG, DCI, Subtarget);
51320   case X86ISD::FMADDSUB_RND:
51321   case X86ISD::FMSUBADD_RND:
51322   case X86ISD::FMADDSUB:
51323   case X86ISD::FMSUBADD:    return combineFMADDSUB(N, DAG, DCI);
51324   case X86ISD::MOVMSK:      return combineMOVMSK(N, DAG, DCI, Subtarget);
51325   case X86ISD::MGATHER:
51326   case X86ISD::MSCATTER:    return combineX86GatherScatter(N, DAG, DCI);
51327   case ISD::MGATHER:
51328   case ISD::MSCATTER:       return combineGatherScatter(N, DAG, DCI);
51329   case X86ISD::PCMPEQ:
51330   case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
51331   case X86ISD::PMULDQ:
51332   case X86ISD::PMULUDQ:     return combinePMULDQ(N, DAG, DCI, Subtarget);
51333   case X86ISD::KSHIFTL:
51334   case X86ISD::KSHIFTR:     return combineKSHIFT(N, DAG, DCI);
51335   case ISD::FP16_TO_FP:     return combineFP16_TO_FP(N, DAG, Subtarget);
51336   case ISD::STRICT_FP_EXTEND:
51337   case ISD::FP_EXTEND:      return combineFP_EXTEND(N, DAG, Subtarget);
51338   case ISD::FP_ROUND:       return combineFP_ROUND(N, DAG, Subtarget);
51339   case X86ISD::VBROADCAST_LOAD:
51340   case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
51341   case X86ISD::MOVDQ2Q:     return combineMOVDQ2Q(N, DAG);
51342   case X86ISD::PDEP:        return combinePDEP(N, DAG, DCI);
51343   }
51344 
51345   return SDValue();
51346 }
51347 
51348 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
51349   if (!isTypeLegal(VT))
51350     return false;
51351 
51352   // There are no vXi8 shifts.
51353   if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
51354     return false;
51355 
51356   // TODO: Almost no 8-bit ops are desirable because they have no actual
51357   //       size/speed advantages vs. 32-bit ops, but they do have a major
51358   //       potential disadvantage by causing partial register stalls.
51359   //
51360   // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
51361   // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
51362   // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
51363   // check for a constant operand to the multiply.
51364   if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
51365     return false;
51366 
51367   // i16 instruction encodings are longer and some i16 instructions are slow,
51368   // so those are not desirable.
51369   if (VT == MVT::i16) {
51370     switch (Opc) {
51371     default:
51372       break;
51373     case ISD::LOAD:
51374     case ISD::SIGN_EXTEND:
51375     case ISD::ZERO_EXTEND:
51376     case ISD::ANY_EXTEND:
51377     case ISD::SHL:
51378     case ISD::SRA:
51379     case ISD::SRL:
51380     case ISD::SUB:
51381     case ISD::ADD:
51382     case ISD::MUL:
51383     case ISD::AND:
51384     case ISD::OR:
51385     case ISD::XOR:
51386       return false;
51387     }
51388   }
51389 
51390   // Any legal type not explicitly accounted for above is desirable.
51391   return true;
51392 }
51393 
51394 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
51395                                                   SDValue Value, SDValue Addr,
51396                                                   SelectionDAG &DAG) const {
51397   const Module *M = DAG.getMachineFunction().getMMI().getModule();
51398   Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
51399   if (IsCFProtectionSupported) {
51400     // In case control-flow branch protection is enabled, we need to add a
51401     // notrack prefix to the indirect branch.
51402     // In order to do that we create an NT_BRIND SDNode.
51403     // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
51404     return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
51405   }
51406 
51407   return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
51408 }
51409 
51410 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
51411   EVT VT = Op.getValueType();
51412   bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
51413                              isa<ConstantSDNode>(Op.getOperand(1));
51414 
51415   // i16 is legal, but undesirable since i16 instruction encodings are longer
51416   // and some i16 instructions are slow.
51417   // 8-bit multiply-by-constant can usually be expanded to something cheaper
51418   // using LEA and/or other ALU ops.
51419   if (VT != MVT::i16 && !Is8BitMulByConstant)
51420     return false;
51421 
51422   auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
51423     if (!Op.hasOneUse())
51424       return false;
51425     SDNode *User = *Op->use_begin();
51426     if (!ISD::isNormalStore(User))
51427       return false;
51428     auto *Ld = cast<LoadSDNode>(Load);
51429     auto *St = cast<StoreSDNode>(User);
51430     return Ld->getBasePtr() == St->getBasePtr();
51431   };
51432 
51433   auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
51434     if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
51435       return false;
51436     if (!Op.hasOneUse())
51437       return false;
51438     SDNode *User = *Op->use_begin();
51439     if (User->getOpcode() != ISD::ATOMIC_STORE)
51440       return false;
51441     auto *Ld = cast<AtomicSDNode>(Load);
51442     auto *St = cast<AtomicSDNode>(User);
51443     return Ld->getBasePtr() == St->getBasePtr();
51444   };
51445 
51446   bool Commute = false;
51447   switch (Op.getOpcode()) {
51448   default: return false;
51449   case ISD::SIGN_EXTEND:
51450   case ISD::ZERO_EXTEND:
51451   case ISD::ANY_EXTEND:
51452     break;
51453   case ISD::SHL:
51454   case ISD::SRA:
51455   case ISD::SRL: {
51456     SDValue N0 = Op.getOperand(0);
51457     // Look out for (store (shl (load), x)).
51458     if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
51459       return false;
51460     break;
51461   }
51462   case ISD::ADD:
51463   case ISD::MUL:
51464   case ISD::AND:
51465   case ISD::OR:
51466   case ISD::XOR:
51467     Commute = true;
51468     LLVM_FALLTHROUGH;
51469   case ISD::SUB: {
51470     SDValue N0 = Op.getOperand(0);
51471     SDValue N1 = Op.getOperand(1);
51472     // Avoid disabling potential load folding opportunities.
51473     if (MayFoldLoad(N1) &&
51474         (!Commute || !isa<ConstantSDNode>(N0) ||
51475          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
51476       return false;
51477     if (MayFoldLoad(N0) &&
51478         ((Commute && !isa<ConstantSDNode>(N1)) ||
51479          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
51480       return false;
51481     if (IsFoldableAtomicRMW(N0, Op) ||
51482         (Commute && IsFoldableAtomicRMW(N1, Op)))
51483       return false;
51484   }
51485   }
51486 
51487   PVT = MVT::i32;
51488   return true;
51489 }
51490 
51491 //===----------------------------------------------------------------------===//
51492 //                           X86 Inline Assembly Support
51493 //===----------------------------------------------------------------------===//
51494 
51495 // Helper to match an asm string against a sequence of pieces separated by whitespace.
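      // e.g. matchAsm("bswap $0", {"bswap", "$0"}) returns true, while trailing or
      // unmatched text makes it return false.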
51496 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
51497   S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
51498 
51499   for (StringRef Piece : Pieces) {
51500     if (!S.startswith(Piece)) // Check if the piece matches.
51501       return false;
51502 
51503     S = S.substr(Piece.size());
51504     StringRef::size_type Pos = S.find_first_not_of(" \t");
51505     if (Pos == 0) // We matched a prefix.
51506       return false;
51507 
51508     S = S.substr(Pos);
51509   }
51510 
51511   return S.empty();
51512 }
51513 
51514 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
51515 
51516   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
51517     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
51518         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
51519         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
51520 
51521       if (AsmPieces.size() == 3)
51522         return true;
51523       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
51524         return true;
51525     }
51526   }
51527   return false;
51528 }
51529 
51530 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
51531   InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
51532 
51533   const std::string &AsmStr = IA->getAsmString();
51534 
51535   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
51536   if (!Ty || Ty->getBitWidth() % 16 != 0)
51537     return false;
51538 
51539   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
51540   SmallVector<StringRef, 4> AsmPieces;
51541   SplitString(AsmStr, AsmPieces, ";\n");
51542 
51543   switch (AsmPieces.size()) {
51544   default: return false;
51545   case 1:
51546     // FIXME: this should verify that we are targeting a 486 or better.  If not,
51547     // we will turn this bswap into something that will be lowered to logical
51548     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
51549     // lower so don't worry about this.
51550     // bswap $0
51551     if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
51552         matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
51553         matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
51554         matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
51555         matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
51556         matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
51557       // No need to check constraints, nothing other than the equivalent of
51558       // "=r,0" would be valid here.
51559       return IntrinsicLowering::LowerToByteSwap(CI);
51560     }
51561 
51562     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
51563     if (CI->getType()->isIntegerTy(16) &&
51564         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
51565         (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
51566          matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
51567       AsmPieces.clear();
51568       StringRef ConstraintsStr = IA->getConstraintString();
51569       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
51570       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
51571       if (clobbersFlagRegisters(AsmPieces))
51572         return IntrinsicLowering::LowerToByteSwap(CI);
51573     }
51574     break;
51575   case 3:
51576     if (CI->getType()->isIntegerTy(32) &&
51577         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
51578         matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
51579         matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
51580         matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
51581       AsmPieces.clear();
51582       StringRef ConstraintsStr = IA->getConstraintString();
51583       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
51584       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
51585       if (clobbersFlagRegisters(AsmPieces))
51586         return IntrinsicLowering::LowerToByteSwap(CI);
51587     }
51588 
51589     if (CI->getType()->isIntegerTy(64)) {
51590       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
51591       if (Constraints.size() >= 2 &&
51592           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
51593           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
51594         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
51595         if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
51596             matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
51597             matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
51598           return IntrinsicLowering::LowerToByteSwap(CI);
51599       }
51600     }
51601     break;
51602   }
51603   return false;
51604 }
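// A rough sketch of what gets expanded here: IR such as
//   %y = call i32 asm "bswap $0", "=r,0"(i32 %x)
// or the 16-bit rotate form
//   %y = call i16 asm "rorw $$8, ${0:w}",
//                     "=r,0,~{cc},~{dirflag},~{fpsr},~{flags}"(i16 %x)
// is rewritten into a call to the llvm.bswap intrinsic of the matching width.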
51605 
51606 static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
51607   X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
51608                            .Case("{@cca}", X86::COND_A)
51609                            .Case("{@ccae}", X86::COND_AE)
51610                            .Case("{@ccb}", X86::COND_B)
51611                            .Case("{@ccbe}", X86::COND_BE)
51612                            .Case("{@ccc}", X86::COND_B)
51613                            .Case("{@cce}", X86::COND_E)
51614                            .Case("{@ccz}", X86::COND_E)
51615                            .Case("{@ccg}", X86::COND_G)
51616                            .Case("{@ccge}", X86::COND_GE)
51617                            .Case("{@ccl}", X86::COND_L)
51618                            .Case("{@ccle}", X86::COND_LE)
51619                            .Case("{@ccna}", X86::COND_BE)
51620                            .Case("{@ccnae}", X86::COND_B)
51621                            .Case("{@ccnb}", X86::COND_AE)
51622                            .Case("{@ccnbe}", X86::COND_A)
51623                            .Case("{@ccnc}", X86::COND_AE)
51624                            .Case("{@ccne}", X86::COND_NE)
51625                            .Case("{@ccnz}", X86::COND_NE)
51626                            .Case("{@ccng}", X86::COND_LE)
51627                            .Case("{@ccnge}", X86::COND_L)
51628                            .Case("{@ccnl}", X86::COND_GE)
51629                            .Case("{@ccnle}", X86::COND_G)
51630                            .Case("{@ccno}", X86::COND_NO)
51631                            .Case("{@ccnp}", X86::COND_NP)
51632                            .Case("{@ccns}", X86::COND_NS)
51633                            .Case("{@cco}", X86::COND_O)
51634                            .Case("{@ccp}", X86::COND_P)
51635                            .Case("{@ccs}", X86::COND_S)
51636                            .Default(X86::COND_INVALID);
51637   return Cond;
51638 }
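// For example, the constraint code "{@ccz}" (the braced form of a "=@ccz"
// flag-output constraint) maps to X86::COND_E; anything unrecognized yields
// X86::COND_INVALID.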
51639 
51640 /// Given a constraint letter, return the type of constraint for this target.
51641 X86TargetLowering::ConstraintType
51642 X86TargetLowering::getConstraintType(StringRef Constraint) const {
51643   if (Constraint.size() == 1) {
51644     switch (Constraint[0]) {
51645     case 'R':
51646     case 'q':
51647     case 'Q':
51648     case 'f':
51649     case 't':
51650     case 'u':
51651     case 'y':
51652     case 'x':
51653     case 'v':
51654     case 'l':
51655     case 'k': // AVX512 masking registers.
51656       return C_RegisterClass;
51657     case 'a':
51658     case 'b':
51659     case 'c':
51660     case 'd':
51661     case 'S':
51662     case 'D':
51663     case 'A':
51664       return C_Register;
51665     case 'I':
51666     case 'J':
51667     case 'K':
51668     case 'N':
51669     case 'G':
51670     case 'L':
51671     case 'M':
51672       return C_Immediate;
51673     case 'C':
51674     case 'e':
51675     case 'Z':
51676       return C_Other;
51677     default:
51678       break;
51679     }
51680   }
51681   else if (Constraint.size() == 2) {
51682     switch (Constraint[0]) {
51683     default:
51684       break;
51685     case 'Y':
51686       switch (Constraint[1]) {
51687       default:
51688         break;
51689       case 'z':
51690         return C_Register;
51691       case 'i':
51692       case 'm':
51693       case 'k':
51694       case 't':
51695       case '2':
51696         return C_RegisterClass;
51697       }
51698     }
51699   } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
51700     return C_Other;
51701   return TargetLowering::getConstraintType(Constraint);
51702 }
51703 
51704 /// Examine constraint type and operand type and determine a weight value.
51705 /// This object must already have been set up with the operand type
51706 /// and the current alternative constraint selected.
51707 TargetLowering::ConstraintWeight
51708   X86TargetLowering::getSingleConstraintMatchWeight(
51709     AsmOperandInfo &info, const char *constraint) const {
51710   ConstraintWeight weight = CW_Invalid;
51711   Value *CallOperandVal = info.CallOperandVal;
51712   // If we don't have a value, we can't do a match,
51713   // but allow it at the lowest weight.
51714   if (!CallOperandVal)
51715     return CW_Default;
51716   Type *type = CallOperandVal->getType();
51717   // Look at the constraint type.
51718   switch (*constraint) {
51719   default:
51720     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
51721     LLVM_FALLTHROUGH;
51722   case 'R':
51723   case 'q':
51724   case 'Q':
51725   case 'a':
51726   case 'b':
51727   case 'c':
51728   case 'd':
51729   case 'S':
51730   case 'D':
51731   case 'A':
51732     if (CallOperandVal->getType()->isIntegerTy())
51733       weight = CW_SpecificReg;
51734     break;
51735   case 'f':
51736   case 't':
51737   case 'u':
51738     if (type->isFloatingPointTy())
51739       weight = CW_SpecificReg;
51740     break;
51741   case 'y':
51742     if (type->isX86_MMXTy() && Subtarget.hasMMX())
51743       weight = CW_SpecificReg;
51744     break;
51745   case 'Y':
51746     if (StringRef(constraint).size() != 2)
51747       break;
51748     switch (constraint[1]) {
51749       default:
51750         return CW_Invalid;
51751       // XMM0
51752       case 'z':
51753         if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
51754             ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
51755             ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
51756           return CW_SpecificReg;
51757         return CW_Invalid;
51758       // Conditional OpMask regs (AVX512)
51759       case 'k':
51760         if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
51761           return CW_Register;
51762         return CW_Invalid;
51763       // Any MMX reg
51764       case 'm':
51765         if (type->isX86_MMXTy() && Subtarget.hasMMX())
51766           return weight;
51767         return CW_Invalid;
51768       // Any SSE reg when ISA >= SSE2, same as 'x'
51769       case 'i':
51770       case 't':
51771       case '2':
51772         if (!Subtarget.hasSSE2())
51773           return CW_Invalid;
51774         break;
51775     }
51776     break;
51777   case 'v':
51778     if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
51779       weight = CW_Register;
51780     LLVM_FALLTHROUGH;
51781   case 'x':
51782     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
51783         ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
51784       weight = CW_Register;
51785     break;
51786   case 'k':
51787     // Enable conditional vector operations using %k<#> registers.
51788     if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
51789       weight = CW_Register;
51790     break;
51791   case 'I':
51792     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
51793       if (C->getZExtValue() <= 31)
51794         weight = CW_Constant;
51795     }
51796     break;
51797   case 'J':
51798     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51799       if (C->getZExtValue() <= 63)
51800         weight = CW_Constant;
51801     }
51802     break;
51803   case 'K':
51804     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51805       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
51806         weight = CW_Constant;
51807     }
51808     break;
51809   case 'L':
51810     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51811       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
51812         weight = CW_Constant;
51813     }
51814     break;
51815   case 'M':
51816     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51817       if (C->getZExtValue() <= 3)
51818         weight = CW_Constant;
51819     }
51820     break;
51821   case 'N':
51822     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51823       if (C->getZExtValue() <= 0xff)
51824         weight = CW_Constant;
51825     }
51826     break;
51827   case 'G':
51828   case 'C':
51829     if (isa<ConstantFP>(CallOperandVal)) {
51830       weight = CW_Constant;
51831     }
51832     break;
51833   case 'e':
51834     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51835       if ((C->getSExtValue() >= -0x80000000LL) &&
51836           (C->getSExtValue() <= 0x7fffffffLL))
51837         weight = CW_Constant;
51838     }
51839     break;
51840   case 'Z':
51841     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51842       if (C->getZExtValue() <= 0xffffffff)
51843         weight = CW_Constant;
51844     }
51845     break;
51846   }
51847   return weight;
51848 }
51849 
51850 /// Try to replace an X constraint, which matches anything, with another that
51851 /// has more specific requirements based on the type of the corresponding
51852 /// operand.
51853 const char *X86TargetLowering::
51854 LowerXConstraint(EVT ConstraintVT) const {
51855   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
51856   // 'f' like normal targets.
51857   if (ConstraintVT.isFloatingPoint()) {
51858     if (Subtarget.hasSSE1())
51859       return "x";
51860   }
51861 
51862   return TargetLowering::LowerXConstraint(ConstraintVT);
51863 }
51864 
51865 // Lower @cc targets via setcc.
51866 SDValue X86TargetLowering::LowerAsmOutputForConstraint(
51867     SDValue &Chain, SDValue &Flag, const SDLoc &DL,
51868     const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
51869   X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
51870   if (Cond == X86::COND_INVALID)
51871     return SDValue();
51872   // Check that return type is valid.
51873   if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
51874       OpInfo.ConstraintVT.getSizeInBits() < 8)
51875     report_fatal_error("Flag output operand is of invalid type");
51876 
51877   // Get EFLAGS register. Only update chain when copyfrom is glued.
51878   if (Flag.getNode()) {
51879     Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
51880     Chain = Flag.getValue(1);
51881   } else
51882     Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
51883   // Extract CC code.
51884   SDValue CC = getSETCC(Cond, Flag, DL, DAG);
51885   // Zero-extend the i8 flag result to the constraint's type.
51886   SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
51887 
51888   return Result;
51889 }
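// Roughly, for a "{@ccz}" output this builds:
//   flags  = CopyFromReg EFLAGS
//   setcc  = X86ISD::SETCC COND_E, flags     (an i8 value, via getSETCC)
//   result = zero_extend setcc to OpInfo.ConstraintVT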
51890 
51891 /// Lower the specified operand into the Ops vector.
51892 /// If it is invalid, don't add anything to Ops.
51893 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
51894                                                      std::string &Constraint,
51895                                                      std::vector<SDValue>&Ops,
51896                                                      SelectionDAG &DAG) const {
51897   SDValue Result;
51898 
51899   // Only support length 1 constraints for now.
51900   if (Constraint.length() > 1) return;
51901 
51902   char ConstraintLetter = Constraint[0];
51903   switch (ConstraintLetter) {
51904   default: break;
51905   case 'I':
51906     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51907       if (C->getZExtValue() <= 31) {
51908         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51909                                        Op.getValueType());
51910         break;
51911       }
51912     }
51913     return;
51914   case 'J':
51915     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51916       if (C->getZExtValue() <= 63) {
51917         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51918                                        Op.getValueType());
51919         break;
51920       }
51921     }
51922     return;
51923   case 'K':
51924     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51925       if (isInt<8>(C->getSExtValue())) {
51926         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51927                                        Op.getValueType());
51928         break;
51929       }
51930     }
51931     return;
51932   case 'L':
51933     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51934       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
51935           (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
51936         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
51937                                        Op.getValueType());
51938         break;
51939       }
51940     }
51941     return;
51942   case 'M':
51943     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51944       if (C->getZExtValue() <= 3) {
51945         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51946                                        Op.getValueType());
51947         break;
51948       }
51949     }
51950     return;
51951   case 'N':
51952     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51953       if (C->getZExtValue() <= 255) {
51954         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51955                                        Op.getValueType());
51956         break;
51957       }
51958     }
51959     return;
51960   case 'O':
51961     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51962       if (C->getZExtValue() <= 127) {
51963         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51964                                        Op.getValueType());
51965         break;
51966       }
51967     }
51968     return;
51969   case 'e': {
51970     // 32-bit signed value
51971     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51972       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
51973                                            C->getSExtValue())) {
51974         // Widen to 64 bits here to get it sign extended.
51975         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
51976         break;
51977       }
51978       // FIXME gcc accepts some relocatable values here too, but only in
51979       // certain memory models; it's complicated.
51980     }
51981     return;
51982   }
51983   case 'Z': {
51984     // 32-bit unsigned value
51985     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51986       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
51987                                            C->getZExtValue())) {
51988         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51989                                        Op.getValueType());
51990         break;
51991       }
51992     }
51993     // FIXME gcc accepts some relocatable values here too, but only in certain
51994     // memory models; it's complicated.
51995     return;
51996   }
51997   case 'i': {
51998     // Literal immediates are always ok.
51999     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
52000       bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
52001       BooleanContent BCont = getBooleanContents(MVT::i64);
52002       ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
52003                                     : ISD::SIGN_EXTEND;
52004       int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
52005                                                   : CST->getSExtValue();
52006       Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
52007       break;
52008     }
52009 
52010     // In any sort of PIC mode addresses need to be computed at runtime by
52011     // adding in a register or some sort of table lookup.  These can't
52012     // be used as immediates.
52013     if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
52014       return;
52015 
52016     // If we are in non-pic codegen mode, we allow the address of a global (with
52017     // an optional displacement) to be used with 'i'.
52018     if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
52019       // If we require an extra load to get this address, as in PIC mode, we
52020       // can't accept it.
52021       if (isGlobalStubReference(
52022               Subtarget.classifyGlobalReference(GA->getGlobal())))
52023         return;
52024     break;
52025   }
52026   }
52027 
52028   if (Result.getNode()) {
52029     Ops.push_back(Result);
52030     return;
52031   }
52032   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
52033 }
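// For illustration, with the 'I' constraint (an immediate in [0, 31]) a
// constant operand of 5 is pushed into Ops as a target constant, while 40 is
// rejected and nothing is added.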
52034 
52035 /// Check if \p RC is a general purpose register class.
52036 /// I.e., GR* or one of their variants.
52037 static bool isGRClass(const TargetRegisterClass &RC) {
52038   return RC.hasSuperClassEq(&X86::GR8RegClass) ||
52039          RC.hasSuperClassEq(&X86::GR16RegClass) ||
52040          RC.hasSuperClassEq(&X86::GR32RegClass) ||
52041          RC.hasSuperClassEq(&X86::GR64RegClass) ||
52042          RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
52043 }
52044 
52045 /// Check if \p RC is a vector register class.
52046 /// I.e., FR* / VR* or one of their variants.
52047 static bool isFRClass(const TargetRegisterClass &RC) {
52048   return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
52049          RC.hasSuperClassEq(&X86::FR64XRegClass) ||
52050          RC.hasSuperClassEq(&X86::VR128XRegClass) ||
52051          RC.hasSuperClassEq(&X86::VR256XRegClass) ||
52052          RC.hasSuperClassEq(&X86::VR512RegClass);
52053 }
52054 
52055 /// Check if \p RC is a mask register class.
52056 /// I.e., VK* or one of their variants.
52057 static bool isVKClass(const TargetRegisterClass &RC) {
52058   return RC.hasSuperClassEq(&X86::VK1RegClass) ||
52059          RC.hasSuperClassEq(&X86::VK2RegClass) ||
52060          RC.hasSuperClassEq(&X86::VK4RegClass) ||
52061          RC.hasSuperClassEq(&X86::VK8RegClass) ||
52062          RC.hasSuperClassEq(&X86::VK16RegClass) ||
52063          RC.hasSuperClassEq(&X86::VK32RegClass) ||
52064          RC.hasSuperClassEq(&X86::VK64RegClass);
52065 }
52066 
52067 std::pair<unsigned, const TargetRegisterClass *>
52068 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
52069                                                 StringRef Constraint,
52070                                                 MVT VT) const {
52071   // First, see if this is a constraint that directly corresponds to an LLVM
52072   // register class.
52073   if (Constraint.size() == 1) {
52074     // GCC Constraint Letters
52075     switch (Constraint[0]) {
52076     default: break;
52077     // 'A' means [ER]AX + [ER]DX.
52078     case 'A':
52079       if (Subtarget.is64Bit())
52080         return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
52081       assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
52082              "Expecting 64, 32 or 16 bit subtarget");
52083       return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
52084 
52085       // TODO: Slight differences here in allocation order and leaving
52086       // RIP in the class. Do they matter any more here than they do
52087       // in the normal allocation?
52088     case 'k':
52089       if (Subtarget.hasAVX512()) {
52090         if (VT == MVT::i1)
52091           return std::make_pair(0U, &X86::VK1RegClass);
52092         if (VT == MVT::i8)
52093           return std::make_pair(0U, &X86::VK8RegClass);
52094         if (VT == MVT::i16)
52095           return std::make_pair(0U, &X86::VK16RegClass);
52096       }
52097       if (Subtarget.hasBWI()) {
52098         if (VT == MVT::i32)
52099           return std::make_pair(0U, &X86::VK32RegClass);
52100         if (VT == MVT::i64)
52101           return std::make_pair(0U, &X86::VK64RegClass);
52102       }
52103       break;
52104     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
52105       if (Subtarget.is64Bit()) {
52106         if (VT == MVT::i8 || VT == MVT::i1)
52107           return std::make_pair(0U, &X86::GR8RegClass);
52108         if (VT == MVT::i16)
52109           return std::make_pair(0U, &X86::GR16RegClass);
52110         if (VT == MVT::i32 || VT == MVT::f32)
52111           return std::make_pair(0U, &X86::GR32RegClass);
52112         if (VT != MVT::f80 && !VT.isVector())
52113           return std::make_pair(0U, &X86::GR64RegClass);
52114         break;
52115       }
52116       LLVM_FALLTHROUGH;
52117       // 32-bit fallthrough
52118     case 'Q':   // Q_REGS
52119       if (VT == MVT::i8 || VT == MVT::i1)
52120         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
52121       if (VT == MVT::i16)
52122         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
52123       if (VT == MVT::i32 || VT == MVT::f32 ||
52124           (!VT.isVector() && !Subtarget.is64Bit()))
52125         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
52126       if (VT != MVT::f80 && !VT.isVector())
52127         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
52128       break;
52129     case 'r':   // GENERAL_REGS
52130     case 'l':   // INDEX_REGS
52131       if (VT == MVT::i8 || VT == MVT::i1)
52132         return std::make_pair(0U, &X86::GR8RegClass);
52133       if (VT == MVT::i16)
52134         return std::make_pair(0U, &X86::GR16RegClass);
52135       if (VT == MVT::i32 || VT == MVT::f32 ||
52136           (!VT.isVector() && !Subtarget.is64Bit()))
52137         return std::make_pair(0U, &X86::GR32RegClass);
52138       if (VT != MVT::f80 && !VT.isVector())
52139         return std::make_pair(0U, &X86::GR64RegClass);
52140       break;
52141     case 'R':   // LEGACY_REGS
52142       if (VT == MVT::i8 || VT == MVT::i1)
52143         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
52144       if (VT == MVT::i16)
52145         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
52146       if (VT == MVT::i32 || VT == MVT::f32 ||
52147           (!VT.isVector() && !Subtarget.is64Bit()))
52148         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
52149       if (VT != MVT::f80 && !VT.isVector())
52150         return std::make_pair(0U, &X86::GR64_NOREXRegClass);
52151       break;
52152     case 'f':  // FP Stack registers.
52153       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
52154       // value to the correct fpstack register class.
52155       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
52156         return std::make_pair(0U, &X86::RFP32RegClass);
52157       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
52158         return std::make_pair(0U, &X86::RFP64RegClass);
52159       if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
52160         return std::make_pair(0U, &X86::RFP80RegClass);
52161       break;
52162     case 'y':   // MMX_REGS if MMX allowed.
52163       if (!Subtarget.hasMMX()) break;
52164       return std::make_pair(0U, &X86::VR64RegClass);
52165     case 'v':
52166     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
52167       if (!Subtarget.hasSSE1()) break;
52168       bool VConstraint = (Constraint[0] == 'v');
52169 
52170       switch (VT.SimpleTy) {
52171       default: break;
52172       // Scalar SSE types.
52173       case MVT::f32:
52174       case MVT::i32:
52175         if (VConstraint && Subtarget.hasVLX())
52176           return std::make_pair(0U, &X86::FR32XRegClass);
52177         return std::make_pair(0U, &X86::FR32RegClass);
52178       case MVT::f64:
52179       case MVT::i64:
52180         if (VConstraint && Subtarget.hasVLX())
52181           return std::make_pair(0U, &X86::FR64XRegClass);
52182         return std::make_pair(0U, &X86::FR64RegClass);
52183       case MVT::i128:
52184         if (Subtarget.is64Bit()) {
52185           if (VConstraint && Subtarget.hasVLX())
52186             return std::make_pair(0U, &X86::VR128XRegClass);
52187           return std::make_pair(0U, &X86::VR128RegClass);
52188         }
52189         break;
52190       // Vector types and fp128.
52191       case MVT::f128:
52192       case MVT::v16i8:
52193       case MVT::v8i16:
52194       case MVT::v4i32:
52195       case MVT::v2i64:
52196       case MVT::v4f32:
52197       case MVT::v2f64:
52198         if (VConstraint && Subtarget.hasVLX())
52199           return std::make_pair(0U, &X86::VR128XRegClass);
52200         return std::make_pair(0U, &X86::VR128RegClass);
52201       // AVX types.
52202       case MVT::v32i8:
52203       case MVT::v16i16:
52204       case MVT::v8i32:
52205       case MVT::v4i64:
52206       case MVT::v8f32:
52207       case MVT::v4f64:
52208         if (VConstraint && Subtarget.hasVLX())
52209           return std::make_pair(0U, &X86::VR256XRegClass);
52210         if (Subtarget.hasAVX())
52211           return std::make_pair(0U, &X86::VR256RegClass);
52212         break;
52213       case MVT::v64i8:
52214       case MVT::v32i16:
52215       case MVT::v8f64:
52216       case MVT::v16f32:
52217       case MVT::v16i32:
52218       case MVT::v8i64:
52219         if (!Subtarget.hasAVX512()) break;
52220         if (VConstraint)
52221           return std::make_pair(0U, &X86::VR512RegClass);
52222         return std::make_pair(0U, &X86::VR512_0_15RegClass);
52223       }
52224       break;
52225     }
52226   } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
52227     switch (Constraint[1]) {
52228     default:
52229       break;
52230     case 'i':
52231     case 't':
52232     case '2':
52233       return getRegForInlineAsmConstraint(TRI, "x", VT);
52234     case 'm':
52235       if (!Subtarget.hasMMX()) break;
52236       return std::make_pair(0U, &X86::VR64RegClass);
52237     case 'z':
52238       if (!Subtarget.hasSSE1()) break;
52239       switch (VT.SimpleTy) {
52240       default: break;
52241       // Scalar SSE types.
52242       case MVT::f32:
52243       case MVT::i32:
52244         return std::make_pair(X86::XMM0, &X86::FR32RegClass);
52245       case MVT::f64:
52246       case MVT::i64:
52247         return std::make_pair(X86::XMM0, &X86::FR64RegClass);
52248       case MVT::f128:
52249       case MVT::v16i8:
52250       case MVT::v8i16:
52251       case MVT::v4i32:
52252       case MVT::v2i64:
52253       case MVT::v4f32:
52254       case MVT::v2f64:
52255         return std::make_pair(X86::XMM0, &X86::VR128RegClass);
52256       // AVX types.
52257       case MVT::v32i8:
52258       case MVT::v16i16:
52259       case MVT::v8i32:
52260       case MVT::v4i64:
52261       case MVT::v8f32:
52262       case MVT::v4f64:
52263         if (Subtarget.hasAVX())
52264           return std::make_pair(X86::YMM0, &X86::VR256RegClass);
52265         break;
52266       case MVT::v64i8:
52267       case MVT::v32i16:
52268       case MVT::v8f64:
52269       case MVT::v16f32:
52270       case MVT::v16i32:
52271       case MVT::v8i64:
52272         if (Subtarget.hasAVX512())
52273           return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
52274         break;
52275       }
52276       break;
52277     case 'k':
52278       // This register class doesn't allocate k0 for masked vector operations.
52279       if (Subtarget.hasAVX512()) {
52280         if (VT == MVT::i1)
52281           return std::make_pair(0U, &X86::VK1WMRegClass);
52282         if (VT == MVT::i8)
52283           return std::make_pair(0U, &X86::VK8WMRegClass);
52284         if (VT == MVT::i16)
52285           return std::make_pair(0U, &X86::VK16WMRegClass);
52286       }
52287       if (Subtarget.hasBWI()) {
52288         if (VT == MVT::i32)
52289           return std::make_pair(0U, &X86::VK32WMRegClass);
52290         if (VT == MVT::i64)
52291           return std::make_pair(0U, &X86::VK64WMRegClass);
52292       }
52293       break;
52294     }
52295   }
52296 
52297   if (parseConstraintCode(Constraint) != X86::COND_INVALID)
52298     return std::make_pair(0U, &X86::GR32RegClass);
52299 
52300   // Use the default implementation in TargetLowering to convert the register
52301   // constraint into a member of a register class.
52302   std::pair<Register, const TargetRegisterClass*> Res;
52303   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
52304 
52305   // Not found as a standard register?
52306   if (!Res.second) {
52307     // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
52308     // to/from f80.
52309     if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
52310       // Map the constraints st(0) .. st(7) to the FP0 .. FP7 registers.
52311       if (Constraint.size() == 7 && Constraint[0] == '{' &&
52312           tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
52313           Constraint[3] == '(' &&
52314           (Constraint[4] >= '0' && Constraint[4] <= '7') &&
52315           Constraint[5] == ')' && Constraint[6] == '}') {
52316         // st(7) is not allocatable and thus not a member of RFP80. Return
52317         // singleton class in cases where we have a reference to it.
52318         if (Constraint[4] == '7')
52319           return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
52320         return std::make_pair(X86::FP0 + Constraint[4] - '0',
52321                               &X86::RFP80RegClass);
52322       }
52323 
52324       // GCC allows "st(0)" to be called just plain "st".
52325       if (StringRef("{st}").equals_insensitive(Constraint))
52326         return std::make_pair(X86::FP0, &X86::RFP80RegClass);
52327     }
52328 
52329     // flags -> EFLAGS
52330     if (StringRef("{flags}").equals_insensitive(Constraint))
52331       return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
52332 
52333     // dirflag -> DF
52334     // Only allow for clobber.
52335     if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
52336         VT == MVT::Other)
52337       return std::make_pair(X86::DF, &X86::DFCCRRegClass);
52338 
52339     // fpsr -> FPSW
52340     if (StringRef("{fpsr}").equals_insensitive(Constraint))
52341       return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
52342 
52343     return Res;
52344   }
52345 
52346   // Make sure it isn't a register that requires 64-bit mode.
52347   if (!Subtarget.is64Bit() &&
52348       (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
52349       TRI->getEncodingValue(Res.first) >= 8) {
52350     // Register requires REX prefix, but we're in 32-bit mode.
52351     return std::make_pair(0, nullptr);
52352   }
52353 
52354   // Make sure it isn't a register that requires AVX512.
52355   if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
52356       TRI->getEncodingValue(Res.first) & 0x10) {
52357     // Register requires EVEX prefix.
52358     return std::make_pair(0, nullptr);
52359   }
52360 
52361   // Otherwise, check to see if this is a register class of the wrong value
52362   // type.  For example, we want to map "{ax},i32" -> {eax}; we don't want it to
52363   // turn into {ax},{dx}.
52364   // MVT::Other is used to specify clobber names.
52365   if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
52366     return Res;   // Correct type already, nothing to do.
52367 
52368   // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
52369   // return "eax". This should even work for things like getting 64-bit integer
52370   // registers when given an f64 type.
52371   const TargetRegisterClass *Class = Res.second;
52372   // The generic code will match the first register class that contains the
52373   // given register. Thus, based on the ordering of the tablegened file,
52374   // the "plain" GR classes might not come first.
52375   // Therefore, use a helper method.
52376   if (isGRClass(*Class)) {
52377     unsigned Size = VT.getSizeInBits();
52378     if (Size == 1) Size = 8;
52379     Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
52380     if (DestReg > 0) {
52381       bool is64Bit = Subtarget.is64Bit();
52382       const TargetRegisterClass *RC =
52383           Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
52384         : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
52385         : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
52386         : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
52387         : nullptr;
52388       if (Size == 64 && !is64Bit) {
52389         // Model GCC's behavior here and select a fixed pair of 32-bit
52390         // registers.
52391         switch (DestReg) {
52392         case X86::RAX:
52393           return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
52394         case X86::RDX:
52395           return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
52396         case X86::RCX:
52397           return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
52398         case X86::RBX:
52399           return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
52400         case X86::RSI:
52401           return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
52402         case X86::RDI:
52403           return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
52404         case X86::RBP:
52405           return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
52406         default:
52407           return std::make_pair(0, nullptr);
52408         }
52409       }
52410       if (RC && RC->contains(DestReg))
52411         return std::make_pair(DestReg, RC);
52412       return Res;
52413     }
52414     // No register found/type mismatch.
52415     return std::make_pair(0, nullptr);
52416   } else if (isFRClass(*Class)) {
52417     // Handle references to XMM physical registers that got mapped into the
52418     // wrong class.  This can happen with constraints like {xmm0} where the
52419     // target independent register mapper will just pick the first match it can
52420     // find, ignoring the required type.
52421 
52422     // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
52423     if (VT == MVT::f32 || VT == MVT::i32)
52424       Res.second = &X86::FR32XRegClass;
52425     else if (VT == MVT::f64 || VT == MVT::i64)
52426       Res.second = &X86::FR64XRegClass;
52427     else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
52428       Res.second = &X86::VR128XRegClass;
52429     else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
52430       Res.second = &X86::VR256XRegClass;
52431     else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
52432       Res.second = &X86::VR512RegClass;
52433     else {
52434       // Type mismatch and not a clobber: return an error.
52435       Res.first = 0;
52436       Res.second = nullptr;
52437     }
52438   } else if (isVKClass(*Class)) {
52439     if (VT == MVT::i1)
52440       Res.second = &X86::VK1RegClass;
52441     else if (VT == MVT::i8)
52442       Res.second = &X86::VK8RegClass;
52443     else if (VT == MVT::i16)
52444       Res.second = &X86::VK16RegClass;
52445     else if (VT == MVT::i32)
52446       Res.second = &X86::VK32RegClass;
52447     else if (VT == MVT::i64)
52448       Res.second = &X86::VK64RegClass;
52449     else {
52450       // Type mismatch and not a clobber: return an error.
52451       Res.first = 0;
52452       Res.second = nullptr;
52453     }
52454   }
52455 
52456   return Res;
52457 }
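// A few illustrative queries, assuming a 64-bit subtarget with SSE2:
//   ("r",    MVT::i32)   -> (0,   GR32RegClass)   any 32-bit GPR
//   ("{ax}", MVT::i32)   -> (EAX, GR32RegClass)   physreg resized to the VT
//   ("x",    MVT::v4f32) -> (0,   VR128RegClass)  an XMM register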
52458 
52459 InstructionCost X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
52460                                                         const AddrMode &AM,
52461                                                         Type *Ty,
52462                                                         unsigned AS) const {
52463   // Scaling factors are not free at all.
52464   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
52465   // will take 2 allocations in the out-of-order engine instead of 1
52466   // for plain addressing mode, i.e. inst (reg1).
52467   // E.g.,
52468   // vaddps (%rsi,%rdx), %ymm0, %ymm1
52469   // Requires two allocations (one for the load, one for the computation)
52470   // whereas:
52471   // vaddps (%rsi), %ymm0, %ymm1
52472   // Requires just 1 allocation, i.e., freeing allocations for other operations
52473   // and having fewer micro-operations to execute.
52474   //
52475   // For some X86 architectures, this is even worse because for instance for
52476   // stores, the complex addressing mode forces the instruction to use the
52477   // "load" ports instead of the dedicated "store" port.
52478   // E.g., on Haswell:
52479   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
52480   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
52481   if (isLegalAddressingMode(DL, AM, Ty, AS))
52482     // Scale represents reg2 * scale, thus account for 1
52483     // as soon as we use a second register.
52484     return AM.Scale != 0;
52485   return -1;
52486 }
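// A sketch of the returned costs:
//   [%rsi]          ->  0   legal, no index register
//   [%rsi + 4*%rdx] ->  1   legal, but the scaled index costs an allocation
//   illegal mode    -> -1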
52487 
52488 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
52489   // Integer division on x86 is expensive. However, when aggressively optimizing
52490   // for code size, we prefer to use a div instruction, as it is usually smaller
52491   // than the alternative sequence.
52492   // The exception to this is vector division. Since x86 doesn't have vector
52493   // integer division, leaving the division as-is is a loss even in terms of
52494   // size, because it will have to be scalarized, while the alternative code
52495   // sequence can be performed in vector form.
52496   bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
52497   return OptSize && !VT.isVector();
52498 }
52499 
52500 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
52501   if (!Subtarget.is64Bit())
52502     return;
52503 
52504   // Update IsSplitCSR in X86MachineFunctionInfo.
52505   X86MachineFunctionInfo *AFI =
52506       Entry->getParent()->getInfo<X86MachineFunctionInfo>();
52507   AFI->setIsSplitCSR(true);
52508 }
52509 
52510 void X86TargetLowering::insertCopiesSplitCSR(
52511     MachineBasicBlock *Entry,
52512     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
52513   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
52514   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
52515   if (!IStart)
52516     return;
52517 
52518   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
52519   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
52520   MachineBasicBlock::iterator MBBI = Entry->begin();
52521   for (const MCPhysReg *I = IStart; *I; ++I) {
52522     const TargetRegisterClass *RC = nullptr;
52523     if (X86::GR64RegClass.contains(*I))
52524       RC = &X86::GR64RegClass;
52525     else
52526       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
52527 
52528     Register NewVR = MRI->createVirtualRegister(RC);
52529     // Create copy from CSR to a virtual register.
52530     // FIXME: this currently does not emit CFI pseudo-instructions, it works
52531     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
52532     // nounwind. If we want to generalize this later, we may need to emit
52533     // CFI pseudo-instructions.
52534     assert(
52535         Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
52536         "Function should be nounwind in insertCopiesSplitCSR!");
52537     Entry->addLiveIn(*I);
52538     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
52539         .addReg(*I);
52540 
52541     // Insert the copy-back instructions right before the terminator.
52542     for (auto *Exit : Exits)
52543       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
52544               TII->get(TargetOpcode::COPY), *I)
52545           .addReg(NewVR);
52546   }
52547 }
52548 
52549 bool X86TargetLowering::supportSwiftError() const {
52550   return Subtarget.is64Bit();
52551 }
52552 
52553 /// Returns true if stack probing through a function call is requested.
52554 bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
52555   return !getStackProbeSymbolName(MF).empty();
52556 }
52557 
52558 /// Returns true if stack probing through inline assembly is requested.
52559 bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
52560 
52561   // No inline stack probe for Windows; it has its own mechanism.
52562   if (Subtarget.isOSWindows() ||
52563       MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
52564     return false;
52565 
52566   // If the function specifically requests inline stack probes, emit them.
52567   if (MF.getFunction().hasFnAttribute("probe-stack"))
52568     return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
52569            "inline-asm";
52570 
52571   return false;
52572 }
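// For example, a function carrying the IR attribute
// "probe-stack"="inline-asm" gets inline probes (outside of Windows); any
// other value is returned verbatim by getStackProbeSymbolName() below.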
52573 
52574 /// Returns the name of the symbol used to emit stack probes or the empty
52575 /// string if not applicable.
52576 StringRef
52577 X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
52578   // Inline stack probes disable the stack probe call.
52579   if (hasInlineStackProbe(MF))
52580     return "";
52581 
52582   // If the function specifically requests stack probes, emit them.
52583   if (MF.getFunction().hasFnAttribute("probe-stack"))
52584     return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
52585 
52586   // Generally, if we aren't on Windows, the platform ABI does not include
52587   // support for stack probes, so don't emit them.
52588   if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
52589       MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
52590     return "";
52591 
52592   // We need a stack probe to conform to the Windows ABI. Choose the right
52593   // symbol.
52594   if (Subtarget.is64Bit())
52595     return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
52596   return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
52597 }
52598 
52599 unsigned
52600 X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
52601   // The default stack probe size is 4096 if the function has no
52602   // "stack-probe-size" attribute.
52603   unsigned StackProbeSize = 4096;
52604   const Function &Fn = MF.getFunction();
52605   if (Fn.hasFnAttribute("stack-probe-size"))
52606     Fn.getFnAttribute("stack-probe-size")
52607         .getValueAsString()
52608         .getAsInteger(0, StackProbeSize);
52609   return StackProbeSize;
52610 }
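// For example, a function defined with
//   attributes #0 = { "stack-probe-size"="8192" }
// is probed every 8192 bytes rather than the default 4096.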
52611 
52612 Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
52613   if (ML->isInnermost() &&
52614       ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
52615     return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
52616   return TargetLowering::getPrefLoopAlignment();
52617 }
52618